Fix flaky test: test_shutsdown_cleanly (#8582) #2254
24 fail, 109 skipped, 3 922 pass in 11h 5m 29s
29 files 29 suites 11h 5m 29s ⏱️
4 055 tests 3 922 ✅ 109 💤 24 ❌
54 888 runs 52 233 ✅ 2 409 💤 246 ❌
Results for commit fa4976e.
Annotations
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
10 out of 12 runs failed: test_merge[True-inner] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 9s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 4s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 6s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 6s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 7s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 7s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 8s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 6s]
Raw output
pandas.errors.SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:56476', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'inner', disk = True
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
await list_eq(joined, pd.merge(A, B, on="y", how=how))
assert all(d is None for d in joined.divisions)
await list_eq(
dd.merge(a, b, left_on="x", right_on="z", how=how),
pd.merge(A, B, left_on="x", right_on="z", how=how),
)
await list_eq(
dd.merge(a, b, left_on="x", right_on="z", how=how, suffixes=("1", "2")),
pd.merge(A, B, left_on="x", right_on="z", how=how, suffixes=("1", "2")),
)
await list_eq(dd.merge(a, b, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(a, B, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(A, b, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(A, B, how=how), pd.merge(A, B, how=how))
await list_eq(
dd.merge(a, b, left_index=True, right_index=True, how=how),
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
await list_eq(
dd.merge(
a, b, left_index=True, right_index=True, how=how, suffixes=("1", "2")
),
pd.merge(
A, B, left_index=True, right_index=True, how=how, suffixes=("1", "2")
),
)
> await list_eq(
dd.merge(a, b, left_on="x", right_index=True, how=how),
pd.merge(A, B, left_on="x", right_index=True, how=how),
)
distributed\shuffle\tests\test_merge.py:218:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:336: in _result
raise exc.with_traceback(tb)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\dask_expr\_merge.py:775: in assign_index_merge_transfer
index["_index"] = df.index
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4299: in __setitem__
self._set_item(key, value)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4526: in _set_item
self._set_item_mgr(key, value, refs)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4484: in _set_item_mgr
self._check_setitem_copy()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
# pyright: reportPropertyTypeMismatch=false
from __future__ import annotations
import collections
from copy import deepcopy
import datetime as dt
from functools import partial
import gc
from json import loads
import operator
import pickle
import re
import sys
from typing import (
TYPE_CHECKING,
Any,
Callable,
ClassVar,
Literal,
NoReturn,
cast,
final,
overload,
)
import warnings
import weakref
import numpy as np
from pandas._config import (
config,
using_copy_on_write,
warn_copy_on_write,
)
from pandas._libs import lib
from pandas._libs.lib import is_range_indexer
from pandas._libs.tslibs import (
Period,
Tick,
Timestamp,
to_offset,
)
from pandas._libs.tslibs.dtypes import freq_to_period_freqstr
from pandas._typing import (
AlignJoin,
AnyArrayLike,
ArrayLike,
Axes,
Axis,
AxisInt,
CompressionOptions,
DtypeArg,
DtypeBackend,
DtypeObj,
FilePath,
FillnaOptions,
FloatFormatType,
FormattersType,
Frequency,
IgnoreRaise,
IndexKeyFunc,
IndexLabel,
InterpolateOptions,
IntervalClosedType,
JSONSerializable,
Level,
Manager,
NaPosition,
NDFrameT,
OpenFileErrors,
RandomState,
ReindexMethod,
Renamer,
Scalar,
Self,
SequenceNotStr,
SortKind,
StorageOptions,
Suffixes,
T,
TimeAmbiguous,
TimedeltaConvertibleTypes,
TimeNonexistent,
TimestampConvertibleTypes,
TimeUnit,
ValueKeyFunc,
WriteBuffer,
WriteExcelBuffer,
npt,
)
from pandas.compat import PYPY
from pandas.compat._constants import REF_COUNT
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.errors import (
AbstractMethodError,
ChainedAssignmentError,
InvalidIndexError,
SettingWithCopyError,
SettingWithCopyWarning,
_chained_assignment_method_msg,
_chained_assignment_warning_method_msg,
_check_cacher,
)
from pandas.util._decorators import (
deprecate_nonkeyword_arguments,
doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
check_dtype_backend,
validate_ascending,
validate_bool_kwarg,
validate_fillna_kwargs,
validate_inclusive,
)
from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.common import (
ensure_object,
ensure_platform_int,
ensure_str,
is_bool,
is_bool_dtype,
is_dict_like,
is_extension_array_dtype,
is_list_like,
is_number,
is_numeric_dtype,
is_re_compilable,
is_scalar,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
ExtensionDtype,
)
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCSeries,
)
from pandas.core.dtypes.inference import (
is_hashable,
is_nested_list_like,
)
from pandas.core.dtypes.missing import (
isna,
notna,
)
from pandas.core import (
algorithms as algos,
arraylike,
common,
indexing,
missing,
nanops,
sample,
)
from pandas.core.array_algos.replace import should_use_regex
from pandas.core.arrays import ExtensionArray
from pandas.core.base import PandasObject
from pandas.core.construction import extract_array
from pandas.core.flags import Flags
from pandas.core.indexes.api import (
DatetimeIndex,
Index,
MultiIndex,
PeriodIndex,
RangeIndex,
default_index,
ensure_index,
)
from pandas.core.internals import (
ArrayManager,
BlockManager,
SingleArrayManager,
)
from pandas.core.internals.construction import (
mgr_to_mgr,
ndarray_to_mgr,
)
from pandas.core.methods.describe import describe_ndframe
from pandas.core.missing import (
clean_fill_method,
clean_reindex_fill_method,
find_valid_index,
)
from pandas.core.reshape.concat import concat
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import get_indexer_indexer
from pandas.core.window import (
Expanding,
ExponentialMovingWindow,
Rolling,
Window,
)
from pandas.io.formats.format import (
DataFrameFormatter,
DataFrameRenderer,
)
from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
from collections.abc import (
Hashable,
Iterator,
Mapping,
Sequence,
)
from pandas._libs.tslibs import BaseOffset
from pandas import (
DataFrame,
ExcelWriter,
HDFStore,
Series,
)
from pandas.core.indexers.objects import BaseIndexer
from pandas.core.resample import Resampler
# goal is to be able to define the docs close to function, while still being
# able to share
_shared_docs = {**_shared_docs}
_shared_doc_kwargs = {
"axes": "keywords for axes",
"klass": "Series/DataFrame",
"axes_single_arg": "{0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame", # noqa: E501
"inplace": """
inplace : bool, default False
If True, performs operation inplace and returns None.""",
"optional_by": """
by : str or list of str
Name or list of names to sort by""",
}
bool_t = bool # Need alias because NDFrame has def bool:
class NDFrame(PandasObject, indexing.IndexingMixin):
"""
N-dimensional analogue of DataFrame. Store multi-dimensional in a
size-mutable, labeled data structure
Parameters
----------
data : BlockManager
axes : list
copy : bool, default False
"""
_internal_names: list[str] = [
"_mgr",
"_cacher",
"_item_cache",
"_cache",
"_is_copy",
"_name",
"_metadata",
"_flags",
]
_internal_names_set: set[str] = set(_internal_names)
_accessors: set[str] = set()
_hidden_attrs: frozenset[str] = frozenset([])
_metadata: list[str] = []
_is_copy: weakref.ReferenceType[NDFrame] | str | None = None
_mgr: Manager
_attrs: dict[Hashable, Any]
_typ: str
# ----------------------------------------------------------------------
# Constructors
def __init__(self, data: Manager) -> None:
object.__setattr__(self, "_is_copy", None)
object.__setattr__(self, "_mgr", data)
object.__setattr__(self, "_item_cache", {})
object.__setattr__(self, "_attrs", {})
object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))
@final
@classmethod
def _init_mgr(
cls,
mgr: Manager,
axes: dict[Literal["index", "columns"], Axes | None],
dtype: DtypeObj | None = None,
copy: bool_t = False,
) -> Manager:
"""passed a manager and a axes dict"""
for a, axe in axes.items():
if axe is not None:
axe = ensure_index(axe)
bm_axis = cls._get_block_manager_axis(a)
mgr = mgr.reindex_axis(axe, axis=bm_axis)
# make a copy if explicitly requested
if copy:
mgr = mgr.copy()
if dtype is not None:
# avoid further copies if we can
if (
isinstance(mgr, BlockManager)
and len(mgr.blocks) == 1
and mgr.blocks[0].values.dtype == dtype
):
pass
else:
mgr = mgr.astype(dtype=dtype)
return mgr
@final
def _as_manager(self, typ: str, copy: bool_t = True) -> Self:
"""
Private helper function to create a DataFrame with specific manager.
Parameters
----------
typ : {"block", "array"}
copy : bool, default True
Only controls whether the conversion from Block->ArrayManager
copies the 1D arrays (to ensure proper/contiguous memory layout).
Returns
-------
DataFrame
New DataFrame using specified manager type. Is not guaranteed
to be a copy or not.
"""
new_mgr: Manager
new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
# fastpath of passing a manager doesn't check the option/manager class
return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self)
@classmethod
def _from_mgr(cls, mgr: Manager, axes: list[Index]) -> Self:
"""
Construct a new object of this type from a Manager object and axes.
Parameters
----------
mgr : Manager
Must have the same ndim as cls.
axes : list[Index]
Notes
-----
The axes must match mgr.axes, but are required for future-proofing
in the event that axes are refactored out of the Manager objects.
"""
obj = cls.__new__(cls)
NDFrame.__init__(obj, mgr)
return obj
# ----------------------------------------------------------------------
# attrs and flags
@property
def attrs(self) -> dict[Hashable, Any]:
"""
Dictionary of global attributes of this dataset.
.. warning::
attrs is experimental and may change without warning.
See Also
--------
DataFrame.flags : Global flags applying to this object.
Notes
-----
Many operations that create new datasets will copy ``attrs``. Copies
are always deep so that changing ``attrs`` will only affect the
present dataset. ``pandas.concat`` copies ``attrs`` only if all input
datasets have the same ``attrs``.
Examples
--------
For Series:
>>> ser = pd.Series([1, 2, 3])
>>> ser.attrs = {"A": [10, 20, 30]}
>>> ser.attrs
{'A': [10, 20, 30]}
For DataFrame:
>>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
>>> df.attrs = {"A": [10, 20, 30]}
>>> df.attrs
{'A': [10, 20, 30]}
"""
return self._attrs
@attrs.setter
def attrs(self, value: Mapping[Hashable, Any]) -> None:
self._attrs = dict(value)
@final
@property
def flags(self) -> Flags:
"""
Get the properties associated with this pandas object.
The available flags are
* :attr:`Flags.allows_duplicate_labels`
See Also
--------
Flags : Flags that apply to pandas objects.
DataFrame.attrs : Global metadata applying to this dataset.
Notes
-----
"Flags" differ from "metadata". Flags reflect properties of the
pandas object (the Series or DataFrame). Metadata refer to properties
of the dataset, and should be stored in :attr:`DataFrame.attrs`.
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2]})
>>> df.flags
<Flags(allows_duplicate_labels=True)>
Flags can be get or set using ``.``
>>> df.flags.allows_duplicate_labels
True
>>> df.flags.allows_duplicate_labels = False
Or by slicing with a key
>>> df.flags["allows_duplicate_labels"]
False
>>> df.flags["allows_duplicate_labels"] = True
"""
return self._flags
@final
def set_flags(
self,
*,
copy: bool_t = False,
allows_duplicate_labels: bool_t | None = None,
) -> Self:
"""
Return a new object with updated flags.
Parameters
----------
copy : bool, default False
Specify if a copy of the object should be made.
.. note::
The `copy` keyword will change behavior in pandas 3.0.
`Copy-on-Write
<https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
will be enabled by default, which means that all methods with a
`copy` keyword will use a lazy copy mechanism to defer the copy and
ignore the `copy` keyword. The `copy` keyword will be removed in a
future version of pandas.
You can already get the future behavior and improvements through
enabling copy on write ``pd.options.mode.copy_on_write = True``
allows_duplicate_labels : bool, optional
Whether the returned object allows duplicate labels.
Returns
-------
Series or DataFrame
The same type as the caller.
See Also
--------
DataFrame.attrs : Global metadata applying to this dataset.
DataFrame.flags : Global flags applying to this object.
Notes
-----
This method returns a new object that's a view on the same data
as the input. Mutating the input or the output values will be reflected
in the other.
This method is intended to be used in method chains.
"Flags" differ from "metadata". Flags reflect properties of the
pandas object (the Series or DataFrame). Metadata refer to properties
of the dataset, and should be stored in :attr:`DataFrame.attrs`.
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2]})
>>> df.flags.allows_duplicate_labels
True
>>> df2 = df.set_flags(allows_duplicate_labels=False)
>>> df2.flags.allows_duplicate_labels
False
"""
df = self.copy(deep=copy and not using_copy_on_write())
if allows_duplicate_labels is not None:
df.flags["allows_duplicate_labels"] = allows_duplicate_labels
return df
@final
@classmethod
def _validate_dtype(cls, dtype) -> DtypeObj | None:
"""validate the passed dtype"""
if dtype is not None:
dtype = pandas_dtype(dtype)
# a compound dtype
if dtype.kind == "V":
raise NotImplementedError(
"compound dtypes are not implemented "
f"in the {cls.__name__} constructor"
)
return dtype
# ----------------------------------------------------------------------
# Construction
@property
def _constructor(self) -> Callable[..., Self]:
"""
Used when a manipulation result has the same dimensions as the
original.
"""
raise AbstractMethodError(self)
# ----------------------------------------------------------------------
# Internals
@final
@property
def _data(self):
# GH#33054 retained because some downstream packages uses this,
# e.g. fastparquet
# GH#33333
warnings.warn(
f"{type(self).__name__}._data is deprecated and will be removed in "
"a future version. Use public APIs instead.",
DeprecationWarning,
stacklevel=find_stack_level(),
)
return self._mgr
# ----------------------------------------------------------------------
# Axis
_AXIS_ORDERS: list[Literal["index", "columns"]]
_AXIS_TO_AXIS_NUMBER: dict[Axis, AxisInt] = {0: 0, "index": 0, "rows": 0}
_info_axis_number: int
_info_axis_name: Literal["index", "columns"]
_AXIS_LEN: int
@final
def _construct_axes_dict(self, axes: Sequence[Axis] | None = None, **kwargs):
"""Return an axes dictionary for myself."""
d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
# error: Argument 1 to "update" of "MutableMapping" has incompatible type
# "Dict[str, Any]"; expected "SupportsKeysAndGetItem[Union[int, str], Any]"
d.update(kwargs) # type: ignore[arg-type]
return d
@final
@classmethod
def _get_axis_number(cls, axis: Axis) -> AxisInt:
try:
return cls._AXIS_TO_AXIS_NUMBER[axis]
except KeyError:
raise ValueError(f"No axis named {axis} for object type {cls.__name__}")
@final
@classmethod
def _get_axis_name(cls, axis: Axis) -> Literal["index", "columns"]:
axis_number = cls._get_axis_number(axis)
return cls._AXIS_ORDERS[axis_number]
@final
def _get_axis(self, axis: Axis) -> Index:
axis_number = self._get_axis_number(axis)
assert axis_number in {0, 1}
return self.index if axis_number == 0 else self.columns
@final
@classmethod
def _get_block_manager_axis(cls, axis: Axis) -> AxisInt:
"""Map the axis to the block_manager axis."""
axis = cls._get_axis_number(axis)
ndim = cls._AXIS_LEN
if ndim == 2:
# i.e. DataFrame
return 1 - axis
return axis
@final
def _get_axis_resolvers(self, axis: str) -> dict[str, Series | MultiIndex]:
# index or columns
axis_index = getattr(self, axis)
d = {}
prefix = axis[0]
for i, name in enumerate(axis_index.names):
if name is not None:
key = level = name
else:
# prefix with 'i' or 'c' depending on the input axis
# e.g., you must do ilevel_0 for the 0th level of an unnamed
# multiiindex
key = f"{prefix}level_{i}"
level = i
level_values = axis_index.get_level_values(level)
s = level_values.to_series()
s.index = axis_index
d[key] = s
# put the index/columns itself in the dict
if isinstance(axis_index, MultiIndex):
dindex = axis_index
else:
dindex = axis_index.to_series()
d[axis] = dindex
return d
@final
def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]:
from pandas.core.computation.parsing import clean_column_name
d: dict[str, Series | MultiIndex] = {}
for axis_name in self._AXIS_ORDERS:
d.update(self._get_axis_resolvers(axis_name))
return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)}
@final
def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:
"""
Return the special character free column resolvers of a dataframe.
Column names with special characters are 'cleaned up' so that they can
be referred to by backtick quoting.
Used in :meth:`DataFrame.eval`.
"""
from pandas.core.computation.parsing import clean_column_name
from pandas.core.series import Series
if isinstance(self, ABCSeries):
return {clean_column_name(self.name): self}
return {
clean_column_name(k): Series(
v, copy=False, index=self.index, name=k, dtype=self.dtypes[k]
).__finalize__(self)
for k, v in zip(self.columns, self._iter_column_arrays())
if not isinstance(k, int)
}
@final
@property
def _info_axis(self) -> Index:
return getattr(self, self._info_axis_name)
def _is_view_after_cow_rules(self):
# Only to be used in cases of chained assignment checks, this is a
# simplified check that assumes that either the whole object is a view
# or a copy
if len(self._mgr.blocks) == 0: # type: ignore[union-attr]
return False
return self._mgr.blocks[0].refs.has_reference() # type: ignore[union-attr]
@property
def shape(self) -> tuple[int, ...]:
"""
Return a tuple of axis dimensions
"""
return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)
@property
def axes(self) -> list[Index]:
"""
Return index label(s) of the internal NDFrame
"""
# we do it this way because if we have reversed axes, then
# the block manager shows then reversed
return [self._get_axis(a) for a in self._AXIS_ORDERS]
@final
@property
def ndim(self) -> int:
"""
Return an int representing the number of axes / array dimensions.
Return 1 if Series. Otherwise return 2 if DataFrame.
See Also
--------
ndarray.ndim : Number of array dimensions.
Examples
--------
>>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
>>> s.ndim
1
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.ndim
2
"""
return self._mgr.ndim
@final
@property
def size(self) -> int:
"""
Return an int representing the number of elements in this object.
Return the number of rows if Series. Otherwise return the number of
rows times number of columns if DataFrame.
See Also
--------
ndarray.size : Number of elements in the array.
Examples
--------
>>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
>>> s.size
3
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.size
4
"""
return int(np.prod(self.shape))
def set_axis(
self,
labels,
*,
axis: Axis = 0,
copy: bool_t | None = None,
) -> Self:
"""
Assign desired index to given axis.
Indexes for%(extended_summary_sub)s row labels can be changed by assigning
a list-like or Index.
Parameters
----------
labels : list-like, Index
The values for the new index.
axis : %(axes_single_arg)s, default 0
The axis to update. The value 0 identifies the rows. For `Series`
this parameter is unused and defaults to 0.
copy : bool, default True
Whether to make a copy of the underlying data.
.. note::
The `copy` keyword will change behavior in pandas 3.0.
`Copy-on-Write
<https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
will be enabled by default, which means that all methods with a
`copy` keyword will use a lazy copy mechanism to defer the copy and
ignore the `copy` keyword. The `copy` keyword will be removed in a
future version of pandas.
You can already get the future behavior and improvements through
enabling copy on write ``pd.options.mode.copy_on_write = True``
Returns
-------
%(klass)s
An object of type %(klass)s.
See Also
--------
%(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s.
"""
return self._set_axis_nocheck(labels, axis, inplace=False, copy=copy)
@final
def _set_axis_nocheck(
self, labels, axis: Axis, inplace: bool_t, copy: bool_t | None
):
if inplace:
setattr(self, self._get_axis_name(axis), labels)
else:
# With copy=False, we create a new object but don't copy the
# underlying data.
obj = self.copy(deep=copy and not using_copy_on_write())
setattr(obj, obj._get_axis_name(axis), labels)
return obj
@final
def _set_axis(self, axis: AxisInt, labels: AnyArrayLike | list) -> None:
"""
This is called from the cython code when we set the `index` attribute
directly, e.g. `series.index = [1, 2, 3]`.
"""
labels = ensure_index(labels)
self._mgr.set_axis(axis, labels)
self._clear_item_cache()
@final
def swapaxes(self, axis1: Axis, axis2: Axis, copy: bool_t | None = None) -> Self:
"""
Interchange axes and swap values axes appropriately.
.. deprecated:: 2.1.0
``swapaxes`` is deprecated and will be removed.
Please use ``transpose`` instead.
Returns
-------
same as input
Examples
--------
Please see examples for :meth:`DataFrame.transpose`.
"""
warnings.warn(
# GH#51946
f"'{type(self).__name__}.swapaxes' is deprecated and "
"will be removed in a future version. "
f"Please use '{type(self).__name__}.transpose' instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
i = self._get_axis_number(axis1)
j = self._get_axis_number(axis2)
if i == … a list
will call the method numerous times.
format : dict, list of dict
Keyword args to pass to the method call of ``Styler.format``. If a list will
call the method numerous times.
format_index : dict, list of dict
Keyword args to pass to the method call of ``Styler.format_index``. If a
list will call the method numerous times.
render_kwargs : dict
Keyword args to pass to the method call of ``Styler.to_latex``.
Returns
-------
str or None
If buf is None, returns the result as a string. Otherwise returns None.
"""
from pandas.io.formats.style import Styler
self = cast("DataFrame", self)
styler = Styler(self, uuid="")
for kw_name in ["hide", "relabel_index", "format", "format_index"]:
kw = vars()[kw_name]
if isinstance(kw, dict):
getattr(styler, kw_name)(**kw)
elif isinstance(kw, list):
for sub_kw in kw:
getattr(styler, kw_name)(**sub_kw)
# bold_rows is not a direct kwarg of Styler.to_latex
render_kwargs = {} if render_kwargs is None else render_kwargs
if render_kwargs.pop("bold_rows"):
styler.map_index(lambda v: "textbf:--rwrap;")
return styler.to_latex(buf=buf, **render_kwargs)
@overload
def to_csv(
self,
path_or_buf: None = ...,
sep: str = ...,
na_rep: str = ...,
float_format: str | Callable | None = ...,
columns: Sequence[Hashable] | None = ...,
header: bool_t | list[str] = ...,
index: bool_t = ...,
index_label: IndexLabel | None = ...,
mode: str = ...,
encoding: str | None = ...,
compression: CompressionOptions = ...,
quoting: int | None = ...,
quotechar: str = ...,
lineterminator: str | None = ...,
chunksize: int | None = ...,
date_format: str | None = ...,
doublequote: bool_t = ...,
escapechar: str | None = ...,
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
) -> str:
...
@overload
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
sep: str = ...,
na_rep: str = ...,
float_format: str | Callable | None = ...,
columns: Sequence[Hashable] | None = ...,
header: bool_t | list[str] = ...,
index: bool_t = ...,
index_label: IndexLabel | None = ...,
mode: str = ...,
encoding: str | None = ...,
compression: CompressionOptions = ...,
quoting: int | None = ...,
quotechar: str = ...,
lineterminator: str | None = ...,
chunksize: int | None = ...,
date_format: str | None = ...,
doublequote: bool_t = ...,
escapechar: str | None = ...,
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
) -> None:
...
@final
@deprecate_nonkeyword_arguments(
version="3.0", allowed_args=["self", "path_or_buf"], name="to_csv"
)
@doc(
storage_options=_shared_docs["storage_options"],
compression_options=_shared_docs["compression_options"] % "path_or_buf",
)
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
sep: str = ",",
na_rep: str = "",
float_format: str | Callable | None = None,
columns: Sequence[Hashable] | None = None,
header: bool_t | list[str] = True,
index: bool_t = True,
index_label: IndexLabel | None = None,
mode: str = "w",
encoding: str | None = None,
compression: CompressionOptions = "infer",
quoting: int | None = None,
quotechar: str = '"',
lineterminator: str | None = None,
chunksize: int | None = None,
date_format: str | None = None,
doublequote: bool_t = True,
escapechar: str | None = None,
decimal: str = ".",
errors: OpenFileErrors = "strict",
storage_options: StorageOptions | None = None,
) -> str | None:
r"""
Write object to a comma-separated values (csv) file.
Parameters
----------
path_or_buf : str, path object, file-like object, or None, default None
String, path object (implementing os.PathLike[str]), or file-like
object implementing a write() function. If None, the result is
returned as a string. If a non-binary file object is passed, it should
be opened with `newline=''`, disabling universal newlines. If a binary
file object is passed, `mode` might need to contain a `'b'`.
sep : str, default ','
String of length 1. Field delimiter for the output file.
na_rep : str, default ''
Missing data representation.
float_format : str, Callable, default None
Format string for floating point numbers. If a Callable is given, it takes
precedence over other numeric formatting parameters, like decimal.
columns : sequence, optional
Columns to write.
header : bool or list of str, default True
Write out the column names. If a list of strings is given it is
assumed to be aliases for the column names.
index : bool, default True
Write row names (index).
index_label : str or sequence, or False, default None
Column label for index column(s) if desired. If None is given, and
`header` and `index` are True, then the index names are used. A
sequence should be given if the object uses MultiIndex. If
False do not print fields for index names. Use index_label=False
for easier importing in R.
mode : {{'w', 'x', 'a'}}, default 'w'
Forwarded to either `open(mode=)` or `fsspec.open(mode=)` to control
the file opening. Typical values include:
- 'w', truncate the file first.
- 'x', exclusive creation, failing if the file already exists.
- 'a', append to the end of file if it exists.
encoding : str, optional
A string representing the encoding to use in the output file,
defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
is a non-binary file object.
{compression_options}
May be a dict with key 'method' as compression mode
and other entries as additional compression options if
compression mode is 'zip'.
Passing compression options as keys in dict is
supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.
quoting : optional constant from csv module
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
will treat them as non-numeric.
quotechar : str, default '\"'
String of length 1. Character used to quote fields.
lineterminator : str, optional
The newline character or character sequence to use in the output
file. Defaults to `os.linesep`, which depends on the OS in which
this method is called ('\\n' for linux, '\\r\\n' for Windows, i.e.).
.. versionchanged:: 1.5.0
Previously was line_terminator, changed for consistency with
read_csv and the standard library 'csv' module.
chunksize : int or None
Rows to write at a time.
date_format : str, default None
Format string for datetime objects.
doublequote : bool, default True
Control quoting of `quotechar` inside a field.
escapechar : str, default None
String of length 1. Character used to escape `sep` and `quotechar`
when appropriate.
decimal : str, default '.'
Character recognized as decimal separator. E.g. use ',' for
European data.
errors : str, default 'strict'
Specifies how encoding and decoding errors are to be handled.
See the errors argument for :func:`open` for a full list
of options.
{storage_options}
Returns
-------
None or str
If path_or_buf is None, returns the resulting csv format as a
string. Otherwise returns None.
See Also
--------
read_csv : Load a CSV file into a DataFrame.
to_excel : Write DataFrame to an Excel file.
Examples
--------
Create 'out.csv' containing 'df' without indices
>>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'],
... 'mask': ['red', 'purple'],
... 'weapon': ['sai', 'bo staff']}})
>>> df.to_csv('out.csv', index=False) # doctest: +SKIP
Create 'out.zip' containing 'out.csv'
>>> df.to_csv(index=False)
'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
>>> compression_opts = dict(method='zip',
... archive_name='out.csv') # doctest: +SKIP
>>> df.to_csv('out.zip', index=False,
... compression=compression_opts) # doctest: +SKIP
To write a csv file to a new folder or nested folder you will first
need to create it using either Pathlib or os:
>>> from pathlib import Path # doctest: +SKIP
>>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP
>>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP
>>> df.to_csv(filepath) # doctest: +SKIP
>>> import os # doctest: +SKIP
>>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP
>>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP
"""
df = self if isinstance(self, ABCDataFrame) else self.to_frame()
formatter = DataFrameFormatter(
frame=df,
header=header,
index=index,
na_rep=na_rep,
float_format=float_format,
decimal=decimal,
)
return DataFrameRenderer(formatter).to_csv(
path_or_buf,
lineterminator=lineterminator,
sep=sep,
encoding=encoding,
errors=errors,
compression=compression,
quoting=quoting,
columns=columns,
index_label=index_label,
mode=mode,
chunksize=chunksize,
quotechar=quotechar,
date_format=date_format,
doublequote=doublequote,
escapechar=escapechar,
storage_options=storage_options,
)
# ----------------------------------------------------------------------
# Lookup Caching
def _reset_cacher(self) -> None:
"""
Reset the cacher.
"""
raise AbstractMethodError(self)
def _maybe_update_cacher(
self,
clear: bool_t = False,
verify_is_copy: bool_t = True,
inplace: bool_t = False,
) -> None:
"""
See if we need to update our parent cacher if clear, then clear our
cache.
Parameters
----------
clear : bool, default False
Clear the item cache.
verify_is_copy : bool, default True
Provide is_copy checks.
"""
if using_copy_on_write():
return
if verify_is_copy:
self._check_setitem_copy(t="referent")
if clear:
self._clear_item_cache()
def _clear_item_cache(self) -> None:
raise AbstractMethodError(self)
# ----------------------------------------------------------------------
# Indexing Methods
@final
def take(self, indices, axis: Axis = 0, **kwargs) -> Self:
"""
Return the elements in the given *positional* indices along an axis.
This means that we are not indexing according to actual values in
the index attribute of the object. We are indexing according to the
actual position of the element in the object.
Parameters
----------
indices : array-like
An array of ints indicating which positions to take.
axis : {0 or 'index', 1 or 'columns', None}, default 0
The axis on which to select elements. ``0`` means that we are
selecting rows, ``1`` means that we are selecting columns.
For `Series` this parameter is unused and defaults to 0.
**kwargs
For compatibility with :meth:`numpy.take`. Has no effect on the
output.
Returns
-------
same type as caller
An array-like containing the elements taken from the object.
See Also
--------
DataFrame.loc : Select a subset of a DataFrame by labels.
DataFrame.iloc : Select a subset of a DataFrame by positions.
numpy.take : Take elements from an array along an axis.
Examples
--------
>>> df = pd.DataFrame([('falcon', 'bird', 389.0),
... ('parrot', 'bird', 24.0),
... ('lion', 'mammal', 80.5),
... ('monkey', 'mammal', np.nan)],
... columns=['name', 'class', 'max_speed'],
... index=[0, 2, 3, 1])
>>> df
name class max_speed
0 falcon bird 389.0
2 parrot bird 24.0
3 lion mammal 80.5
1 monkey mammal NaN
Take elements at positions 0 and 3 along the axis 0 (default).
Note how the actual indices selected (0 and 1) do not correspond to
our selected indices 0 and 3. That's because we are selecting the 0th
and 3rd rows, not rows whose indices equal 0 and 3.
>>> df.take([0, 3])
name class max_speed
0 falcon bird 389.0
1 monkey mammal NaN
Take elements at indices 1 and 2 along the axis 1 (column selection).
>>> df.take([1, 2], axis=1)
class max_speed
0 bird 389.0
2 bird 24.0
3 mammal 80.5
1 mammal NaN
We may take elements using negative integers for positive indices,
starting from the end of the object, just like with Python lists.
>>> df.take([-1, -2])
name class max_speed
1 monkey mammal NaN
3 lion mammal 80.5
"""
nv.validate_take((), kwargs)
if not isinstance(indices, slice):
indices = np.asarray(indices, dtype=np.intp)
if (
axis == 0
and indices.ndim == 1
and using_copy_on_write()
and is_range_indexer(indices, len(self))
):
return self.copy(deep=None)
elif self.ndim == 1:
raise TypeError(
f"{type(self).__name__}.take requires a sequence of integers, "
"not slice."
)
else:
warnings.warn(
# GH#51539
f"Passing a slice to {type(self).__name__}.take is deprecated "
"and will raise in a future version. Use `obj[slicer]` or pass "
"a sequence of integers instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
# We can get here with a slice via DataFrame.__getitem__
indices = np.arange(
indices.start, indices.stop, indices.step, dtype=np.intp
)
new_data = self._mgr.take(
indices,
axis=self._get_block_manager_axis(axis),
verify=True,
)
return self._constructor_from_mgr(new_data, axes=new_data.axes).__finalize__(
self, method="take"
)
@final
def _take_with_is_copy(self, indices, axis: Axis = 0) -> Self:
"""
Internal version of the `take` method that sets the `_is_copy`
attribute to keep track of the parent dataframe (using in indexing
for the SettingWithCopyWarning).
For Series this does the same as the public take (it never sets `_is_copy`).
See the docstring of `take` for full explanation of the parameters.
"""
result = self.take(indices=indices, axis=axis)
# Maybe set copy if we didn't actually change the index.
if self.ndim == 2 and not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
return result
@final
def xs(
self,
key: IndexLabel,
axis: Axis = 0,
level: IndexLabel | None = None,
drop_level: bool_t = True,
) -> Self:
"""
Return cross-section from the Series/DataFrame.
This method takes a `key` argument to select data at a particular
level of a MultiIndex.
Parameters
----------
key : label or tuple of label
Label contained in the index, or partially in a MultiIndex.
axis : {0 or 'index', 1 or 'columns'}, default 0
Axis to retrieve cross-section on.
level : object, defaults to first n levels (n=1 or len(key))
In case of a key partially contained in a MultiIndex, indicate
which levels are used. Levels can be referred by label or position.
drop_level : bool, default True
If False, returns object with same levels as self.
Returns
-------
Series or DataFrame
Cross-section from the original Series or DataFrame
corresponding to the selected index levels.
See Also
--------
DataFrame.loc : Access a group of rows and columns
by label(s) or a boolean array.
DataFrame.iloc : Purely integer-location based indexing
for selection by position.
Notes
-----
`xs` can not be used to set values.
MultiIndex Slicers is a generic way to get/set values on
any level or levels.
It is a superset of `xs` functionality, see
:ref:`MultiIndex Slicers <advanced.mi_slicers>`.
Examples
--------
>>> d = {'num_legs': [4, 4, 2, 2],
... 'num_wings': [0, 0, 2, 2],
... 'class': ['mammal', 'mammal', 'mammal', 'bird'],
... 'animal': ['cat', 'dog', 'bat', 'penguin'],
... 'locomotion': ['walks', 'walks', 'flies', 'walks']}
>>> df = pd.DataFrame(data=d)
>>> df = df.set_index(['class', 'animal', 'locomotion'])
>>> df
num_legs num_wings
class animal locomotion
mammal cat walks 4 0
dog walks 4 0
bat flies 2 2
bird penguin walks 2 2
Get values at specified index
>>> df.xs('mammal')
num_legs num_wings
animal locomotion
cat walks 4 0
dog walks 4 0
bat flies 2 2
Get values at several indexes
>>> df.xs(('mammal', 'dog', 'walks'))
num_legs 4
num_wings 0
Name: (mammal, dog, walks), dtype: int64
Get values at specified index and level
>>> df.xs('cat', level=1)
num_legs num_wings
class locomotion
mammal walks 4 0
Get values at several indexes and levels
>>> df.xs(('bird', 'walks'),
... level=[0, 'locomotion'])
num_legs num_wings
animal
penguin 2 2
Get values at specified column and axis
>>> df.xs('num_wings', axis=1)
class animal locomotion
mammal cat walks 0
dog walks 0
bat flies 2
bird penguin walks 2
Name: num_wings, dtype: int64
"""
axis = self._get_axis_number(axis)
labels = self._get_axis(axis)
if isinstance(key, list):
raise TypeError("list keys are not supported in xs, pass a tuple instead")
if level is not None:
if not isinstance(labels, MultiIndex):
raise TypeError("Index must be a MultiIndex")
loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)
# create the tuple of the indexer
_indexer = [slice(None)] * self.ndim
_indexer[axis] = loc
indexer = tuple(_indexer)
result = self.iloc[indexer]
setattr(result, result._get_axis_name(axis), new_ax)
return result
if axis == 1:
if drop_level:
return self[key]
index = self.columns
else:
index = self.index
if isinstance(index, MultiIndex):
loc, new_index = index._get_loc_level(key, level=0)
if not drop_level:
if lib.is_integer(loc):
# Slice index must be an integer or None
new_index = index[loc : loc + 1]
else:
new_index = index[loc]
else:
loc = index.get_loc(key)
if isinstance(loc, np.ndarray):
if loc.dtype == np.bool_:
(inds,) = loc.nonzero()
return self._take_with_is_copy(inds, axis=axis)
else:
return self._take_with_is_copy(loc, axis=axis)
if not is_scalar(loc):
new_index = index[loc]
if is_scalar(loc) and axis == 0:
# In this case loc should be an integer
if self.ndim == 1:
# if we encounter an array-like and we only have 1 dim
# that means that their are list/ndarrays inside the Series!
# so just return them (GH 6394)
return self._values[loc]
new_mgr = self._mgr.fast_xs(loc)
result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes)
result._name = self.index[loc]
result = result.__finalize__(self)
elif is_scalar(loc):
result = self.iloc[:, slice(loc, loc + 1)]
elif axis == 1:
result = self.iloc[:, loc]
else:
result = self.iloc[loc]
result.index = new_index
# this could be a view
# but only in a single-dtyped view sliceable case
result._set_is_copy(self, copy=not result._is_view)
return result
def __getitem__(self, item):
raise AbstractMethodError(self)
@final
def _getitem_slice(self, key: slice) -> Self:
"""
__getitem__ for the case where the key is a slice object.
"""
# _convert_slice_indexer to determine if this slice is positional
# or label based, and if the latter, convert to positional
slobj = self.index._convert_slice_indexer(key, kind="getitem")
if isinstance(slobj, np.ndarray):
# reachable with DatetimeIndex
indexer = lib.maybe_indices_to_slice(
slobj.astype(np.intp, copy=False), len(self)
)
if isinstance(indexer, np.ndarray):
# GH#43223 If we can not convert, use take
return self.take(indexer, axis=0)
slobj = indexer
return self._slice(slobj)
def _slice(self, slobj: slice, axis: AxisInt = 0) -> Self:
"""
Construct a slice of this container.
Slicing with this method is *always* positional.
"""
assert isinstance(slobj, slice), type(slobj)
axis = self._get_block_manager_axis(axis)
new_mgr = self._mgr.get_slice(slobj, axis=axis)
result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
result = result.__finalize__(self)
# this could be a view
# but only in a single-dtyped view sliceable case
is_copy = axis != 0 or result._is_view
result._set_is_copy(self, copy=is_copy)
return result
@final
def _set_is_copy(self, ref: NDFrame, copy: bool_t = True) -> None:
if not copy:
self._is_copy = None
else:
assert ref is not None
self._is_copy = weakref.ref(ref)
def _check_is_chained_assignment_possible(self) -> bool_t:
"""
Check if we are a view, have a cacher, and are of mixed type.
If so, then force a setitem_copy check.
Should be called just near setting a value
Will return a boolean if it we are a view and are cached, but a
single-dtype meaning that the cacher should be updated following
setting.
"""
if self._is_copy:
self._check_setitem_copy(t="referent")
return False
@final
def _check_setitem_copy(self, t: str = "setting", force: bool_t = False):
"""
Parameters
----------
t : str, the type of setting error
force : bool, default False
If True, then force showing an error.
validate if we are doing a setitem on a chained copy.
It is technically possible to figure out that we are setting on
a copy even WITH a multi-dtyped pandas object. In other words, some
blocks may be views while other are not. Currently _is_view will ALWAYS
return False for multi-blocks to avoid having to handle this case.
df = DataFrame(np.arange(0,9), columns=['count'])
df['group'] = 'b'
# This technically need not raise SettingWithCopy if both are view
# (which is not generally guaranteed but is usually True. However,
# this is in general not a good practice and we recommend using .loc.
df.iloc[0:5]['group'] = 'a'
"""
if using_copy_on_write() or warn_copy_on_write():
return
# return early if the check is not needed
if not (force or self._is_copy):
return
value = config.get_option("mode.chained_assignment")
if value is None:
return
# see if the copy is not actually referred; if so, then dissolve
# the copy weakref
if self._is_copy is not None and not isinstance(self._is_copy, str):
r = self._is_copy()
if not gc.get_referents(r) or (r is not None and r.shape == self.shape):
self._is_copy = None
return
# a custom message
if isinstance(self._is_copy, str):
t = self._is_copy
elif t == "referent":
t = (
"\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame\n\n"
"See the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy"
)
else:
t = (
"\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame.\n"
"Try using .loc[row_indexer,col_indexer] = value "
"instead\n\nSee the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy"
)
if value == "raise":
raise SettingWithCopyError(t)
if value == "warn":
> warnings.warn(t, SettingWithCopyWarning, stacklevel=find_stack_level())
E pandas.errors.SettingWithCopyWarning:
E A value is trying to be set on a copy of a slice from a DataFrame.
E Try using .loc[row_indexer,col_indexer] = value instead
E
E See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\generic.py:4472: SettingWithCopyWarning
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
10 out of 12 runs failed: test_merge[True-outer] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 9s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 4s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 6s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 7s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 7s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 8s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 7s]
Raw output
pandas.errors.SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:56518', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'outer', disk = True
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
await list_eq(joined, pd.merge(A, B, on="y", how=how))
assert all(d is None for d in joined.divisions)
await list_eq(
dd.merge(a, b, left_on="x", right_on="z", how=how),
pd.merge(A, B, left_on="x", right_on="z", how=how),
)
await list_eq(
dd.merge(a, b, left_on="x", right_on="z", how=how, suffixes=("1", "2")),
pd.merge(A, B, left_on="x", right_on="z", how=how, suffixes=("1", "2")),
)
await list_eq(dd.merge(a, b, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(a, B, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(A, b, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(A, B, how=how), pd.merge(A, B, how=how))
await list_eq(
dd.merge(a, b, left_index=True, right_index=True, how=how),
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
await list_eq(
dd.merge(
a, b, left_index=True, right_index=True, how=how, suffixes=("1", "2")
),
pd.merge(
A, B, left_index=True, right_index=True, how=how, suffixes=("1", "2")
),
)
> await list_eq(
dd.merge(a, b, left_on="x", right_index=True, how=how),
pd.merge(A, B, left_on="x", right_index=True, how=how),
)
distributed\shuffle\tests\test_merge.py:218:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:336: in _result
raise exc.with_traceback(tb)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\dask_expr\_merge.py:775: in assign_index_merge_transfer
index["_index"] = df.index
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4299: in __setitem__
self._set_item(key, value)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4526: in _set_item
self._set_item_mgr(key, value, refs)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4484: in _set_item_mgr
self._check_setitem_copy()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
# pyright: reportPropertyTypeMismatch=false
from __future__ import annotations
import collections
from copy import deepcopy
import datetime as dt
from functools import partial
import gc
from json import loads
import operator
import pickle
import re
import sys
from typing import (
TYPE_CHECKING,
Any,
Callable,
ClassVar,
Literal,
NoReturn,
cast,
final,
overload,
)
import warnings
import weakref
import numpy as np
from pandas._config import (
config,
using_copy_on_write,
warn_copy_on_write,
)
from pandas._libs import lib
from pandas._libs.lib import is_range_indexer
from pandas._libs.tslibs import (
Period,
Tick,
Timestamp,
to_offset,
)
from pandas._libs.tslibs.dtypes import freq_to_period_freqstr
from pandas._typing import (
AlignJoin,
AnyArrayLike,
ArrayLike,
Axes,
Axis,
AxisInt,
CompressionOptions,
DtypeArg,
DtypeBackend,
DtypeObj,
FilePath,
FillnaOptions,
FloatFormatType,
FormattersType,
Frequency,
IgnoreRaise,
IndexKeyFunc,
IndexLabel,
InterpolateOptions,
IntervalClosedType,
JSONSerializable,
Level,
Manager,
NaPosition,
NDFrameT,
OpenFileErrors,
RandomState,
ReindexMethod,
Renamer,
Scalar,
Self,
SequenceNotStr,
SortKind,
StorageOptions,
Suffixes,
T,
TimeAmbiguous,
TimedeltaConvertibleTypes,
TimeNonexistent,
TimestampConvertibleTypes,
TimeUnit,
ValueKeyFunc,
WriteBuffer,
WriteExcelBuffer,
npt,
)
from pandas.compat import PYPY
from pandas.compat._constants import REF_COUNT
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.errors import (
AbstractMethodError,
ChainedAssignmentError,
InvalidIndexError,
SettingWithCopyError,
SettingWithCopyWarning,
_chained_assignment_method_msg,
_chained_assignment_warning_method_msg,
_check_cacher,
)
from pandas.util._decorators import (
deprecate_nonkeyword_arguments,
doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
check_dtype_backend,
validate_ascending,
validate_bool_kwarg,
validate_fillna_kwargs,
validate_inclusive,
)
from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.common import (
ensure_object,
ensure_platform_int,
ensure_str,
is_bool,
is_bool_dtype,
is_dict_like,
is_extension_array_dtype,
is_list_like,
is_number,
is_numeric_dtype,
is_re_compilable,
is_scalar,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
ExtensionDtype,
)
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCSeries,
)
from pandas.core.dtypes.inference import (
is_hashable,
is_nested_list_like,
)
from pandas.core.dtypes.missing import (
isna,
notna,
)
from pandas.core import (
algorithms as algos,
arraylike,
common,
indexing,
missing,
nanops,
sample,
)
from pandas.core.array_algos.replace import should_use_regex
from pandas.core.arrays import ExtensionArray
from pandas.core.base import PandasObject
from pandas.core.construction import extract_array
from pandas.core.flags import Flags
from pandas.core.indexes.api import (
DatetimeIndex,
Index,
MultiIndex,
PeriodIndex,
RangeIndex,
default_index,
ensure_index,
)
from pandas.core.internals import (
ArrayManager,
BlockManager,
SingleArrayManager,
)
from pandas.core.internals.construction import (
mgr_to_mgr,
ndarray_to_mgr,
)
from pandas.core.methods.describe import describe_ndframe
from pandas.core.missing import (
clean_fill_method,
clean_reindex_fill_method,
find_valid_index,
)
from pandas.core.reshape.concat import concat
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import get_indexer_indexer
from pandas.core.window import (
Expanding,
ExponentialMovingWindow,
Rolling,
Window,
)
from pandas.io.formats.format import (
DataFrameFormatter,
DataFrameRenderer,
)
from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
from collections.abc import (
Hashable,
Iterator,
Mapping,
Sequence,
)
from pandas._libs.tslibs import BaseOffset
from pandas import (
DataFrame,
ExcelWriter,
HDFStore,
Series,
)
from pandas.core.indexers.objects import BaseIndexer
from pandas.core.resample import Resampler
# goal is to be able to define the docs close to function, while still being
# able to share
_shared_docs = {**_shared_docs}
_shared_doc_kwargs = {
"axes": "keywords for axes",
"klass": "Series/DataFrame",
"axes_single_arg": "{0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame", # noqa: E501
"inplace": """
inplace : bool, default False
If True, performs operation inplace and returns None.""",
"optional_by": """
by : str or list of str
Name or list of names to sort by""",
}
bool_t = bool # Need alias because NDFrame has def bool:
class NDFrame(PandasObject, indexing.IndexingMixin):
"""
N-dimensional analogue of DataFrame. Store multi-dimensional in a
size-mutable, labeled data structure
Parameters
----------
data : BlockManager
axes : list
copy : bool, default False
"""
_internal_names: list[str] = [
"_mgr",
"_cacher",
"_item_cache",
"_cache",
"_is_copy",
"_name",
"_metadata",
"_flags",
]
_internal_names_set: set[str] = set(_internal_names)
_accessors: set[str] = set()
_hidden_attrs: frozenset[str] = frozenset([])
_metadata: list[str] = []
_is_copy: weakref.ReferenceType[NDFrame] | str | None = None
_mgr: Manager
_attrs: dict[Hashable, Any]
_typ: str
# ----------------------------------------------------------------------
# Constructors
def __init__(self, data: Manager) -> None:
object.__setattr__(self, "_is_copy", None)
object.__setattr__(self, "_mgr", data)
object.__setattr__(self, "_item_cache", {})
object.__setattr__(self, "_attrs", {})
object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))
@final
@classmethod
def _init_mgr(
cls,
mgr: Manager,
axes: dict[Literal["index", "columns"], Axes | None],
dtype: DtypeObj | None = None,
copy: bool_t = False,
) -> Manager:
"""passed a manager and a axes dict"""
for a, axe in axes.items():
if axe is not None:
axe = ensure_index(axe)
bm_axis = cls._get_block_manager_axis(a)
mgr = mgr.reindex_axis(axe, axis=bm_axis)
# make a copy if explicitly requested
if copy:
mgr = mgr.copy()
if dtype is not None:
# avoid further copies if we can
if (
isinstance(mgr, BlockManager)
and len(mgr.blocks) == 1
and mgr.blocks[0].values.dtype == dtype
):
pass
else:
mgr = mgr.astype(dtype=dtype)
return mgr
@final
def _as_manager(self, typ: str, copy: bool_t = True) -> Self:
"""
Private helper function to create a DataFrame with specific manager.
Parameters
----------
typ : {"block", "array"}
copy : bool, default True
Only controls whether the conversion from Block->ArrayManager
copies the 1D arrays (to ensure proper/contiguous memory layout).
Returns
-------
DataFrame
New DataFrame using specified manager type. Is not guaranteed
to be a copy or not.
"""
new_mgr: Manager
new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
# fastpath of passing a manager doesn't check the option/manager class
return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self)
@classmethod
def _from_mgr(cls, mgr: Manager, axes: list[Index]) -> Self:
"""
Construct a new object of this type from a Manager object and axes.
Parameters
----------
mgr : Manager
Must have the same ndim as cls.
axes : list[Index]
Notes
-----
The axes must match mgr.axes, but are required for future-proofing
in the event that axes are refactored out of the Manager objects.
"""
obj = cls.__new__(cls)
NDFrame.__init__(obj, mgr)
return obj
# ----------------------------------------------------------------------
# attrs and flags
@property
def attrs(self) -> dict[Hashable, Any]:
"""
Dictionary of global attributes of this dataset.
.. warning::
attrs is experimental and may change without warning.
See Also
--------
DataFrame.flags : Global flags applying to this object.
Notes
-----
Many operations that create new datasets will copy ``attrs``. Copies
are always deep so that changing ``attrs`` will only affect the
present dataset. ``pandas.concat`` copies ``attrs`` only if all input
datasets have the same ``attrs``.
Examples
--------
For Series:
>>> ser = pd.Series([1, 2, 3])
>>> ser.attrs = {"A": [10, 20, 30]}
>>> ser.attrs
{'A': [10, 20, 30]}
For DataFrame:
>>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
>>> df.attrs = {"A": [10, 20, 30]}
>>> df.attrs
{'A': [10, 20, 30]}
"""
return self._attrs
@attrs.setter
def attrs(self, value: Mapping[Hashable, Any]) -> None:
self._attrs = dict(value)
@final
@property
def flags(self) -> Flags:
"""
Get the properties associated with this pandas object.
The available flags are
* :attr:`Flags.allows_duplicate_labels`
See Also
--------
Flags : Flags that apply to pandas objects.
DataFrame.attrs : Global metadata applying to this dataset.
Notes
-----
"Flags" differ from "metadata". Flags reflect properties of the
pandas object (the Series or DataFrame). Metadata refer to properties
of the dataset, and should be stored in :attr:`DataFrame.attrs`.
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2]})
>>> df.flags
<Flags(allows_duplicate_labels=True)>
Flags can be get or set using ``.``
>>> df.flags.allows_duplicate_labels
True
>>> df.flags.allows_duplicate_labels = False
Or by slicing with a key
>>> df.flags["allows_duplicate_labels"]
False
>>> df.flags["allows_duplicate_labels"] = True
"""
return self._flags
@final
def set_flags(
self,
*,
copy: bool_t = False,
allows_duplicate_labels: bool_t | None = None,
) -> Self:
"""
Return a new object with updated flags.
Parameters
----------
copy : bool, default False
Specify if a copy of the object should be made.
.. note::
The `copy` keyword will change behavior in pandas 3.0.
`Copy-on-Write
<https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
will be enabled by default, which means that all methods with a
`copy` keyword will use a lazy copy mechanism to defer the copy and
ignore the `copy` keyword. The `copy` keyword will be removed in a
future version of pandas.
You can already get the future behavior and improvements through
enabling copy on write ``pd.options.mode.copy_on_write = True``
allows_duplicate_labels : bool, optional
Whether the returned object allows duplicate labels.
Returns
-------
Series or DataFrame
The same type as the caller.
See Also
--------
DataFrame.attrs : Global metadata applying to this dataset.
DataFrame.flags : Global flags applying to this object.
Notes
-----
This method returns a new object that's a view on the same data
as the input. Mutating the input or the output values will be reflected
in the other.
This method is intended to be used in method chains.
"Flags" differ from "metadata". Flags reflect properties of the
pandas object (the Series or DataFrame). Metadata refer to properties
of the dataset, and should be stored in :attr:`DataFrame.attrs`.
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2]})
>>> df.flags.allows_duplicate_labels
True
>>> df2 = df.set_flags(allows_duplicate_labels=False)
>>> df2.flags.allows_duplicate_labels
False
"""
df = self.copy(deep=copy and not using_copy_on_write())
if allows_duplicate_labels is not None:
df.flags["allows_duplicate_labels"] = allows_duplicate_labels
return df
@final
@classmethod
def _validate_dtype(cls, dtype) -> DtypeObj | None:
"""validate the passed dtype"""
if dtype is not None:
dtype = pandas_dtype(dtype)
# a compound dtype
if dtype.kind == "V":
raise NotImplementedError(
"compound dtypes are not implemented "
f"in the {cls.__name__} constructor"
)
return dtype
# ----------------------------------------------------------------------
# Construction
@property
def _constructor(self) -> Callable[..., Self]:
"""
Used when a manipulation result has the same dimensions as the
original.
"""
raise AbstractMethodError(self)
# ----------------------------------------------------------------------
# Internals
@final
@property
def _data(self):
# GH#33054 retained because some downstream packages uses this,
# e.g. fastparquet
# GH#33333
warnings.warn(
f"{type(self).__name__}._data is deprecated and will be removed in "
"a future version. Use public APIs instead.",
DeprecationWarning,
stacklevel=find_stack_level(),
)
return self._mgr
# ----------------------------------------------------------------------
# Axis
_AXIS_ORDERS: list[Literal["index", "columns"]]
_AXIS_TO_AXIS_NUMBER: dict[Axis, AxisInt] = {0: 0, "index": 0, "rows": 0}
_info_axis_number: int
_info_axis_name: Literal["index", "columns"]
_AXIS_LEN: int
@final
def _construct_axes_dict(self, axes: Sequence[Axis] | None = None, **kwargs):
"""Return an axes dictionary for myself."""
d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
# error: Argument 1 to "update" of "MutableMapping" has incompatible type
# "Dict[str, Any]"; expected "SupportsKeysAndGetItem[Union[int, str], Any]"
d.update(kwargs) # type: ignore[arg-type]
return d
@final
@classmethod
def _get_axis_number(cls, axis: Axis) -> AxisInt:
try:
return cls._AXIS_TO_AXIS_NUMBER[axis]
except KeyError:
raise ValueError(f"No axis named {axis} for object type {cls.__name__}")
@final
@classmethod
def _get_axis_name(cls, axis: Axis) -> Literal["index", "columns"]:
axis_number = cls._get_axis_number(axis)
return cls._AXIS_ORDERS[axis_number]
@final
def _get_axis(self, axis: Axis) -> Index:
axis_number = self._get_axis_number(axis)
assert axis_number in {0, 1}
return self.index if axis_number == 0 else self.columns
@final
@classmethod
def _get_block_manager_axis(cls, axis: Axis) -> AxisInt:
"""Map the axis to the block_manager axis."""
axis = cls._get_axis_number(axis)
ndim = cls._AXIS_LEN
if ndim == 2:
# i.e. DataFrame
return 1 - axis
return axis
@final
def _get_axis_resolvers(self, axis: str) -> dict[str, Series | MultiIndex]:
# index or columns
axis_index = getattr(self, axis)
d = {}
prefix = axis[0]
for i, name in enumerate(axis_index.names):
if name is not None:
key = level = name
else:
# prefix with 'i' or 'c' depending on the input axis
# e.g., you must do ilevel_0 for the 0th level of an unnamed
# multiiindex
key = f"{prefix}level_{i}"
level = i
level_values = axis_index.get_level_values(level)
s = level_values.to_series()
s.index = axis_index
d[key] = s
# put the index/columns itself in the dict
if isinstance(axis_index, MultiIndex):
dindex = axis_index
else:
dindex = axis_index.to_series()
d[axis] = dindex
return d
@final
def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]:
from pandas.core.computation.parsing import clean_column_name
d: dict[str, Series | MultiIndex] = {}
for axis_name in self._AXIS_ORDERS:
d.update(self._get_axis_resolvers(axis_name))
return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)}
@final
def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:
"""
Return the special character free column resolvers of a dataframe.
Column names with special characters are 'cleaned up' so that they can
be referred to by backtick quoting.
Used in :meth:`DataFrame.eval`.
"""
from pandas.core.computation.parsing import clean_column_name
from pandas.core.series import Series
if isinstance(self, ABCSeries):
return {clean_column_name(self.name): self}
return {
clean_column_name(k): Series(
v, copy=False, index=self.index, name=k, dtype=self.dtypes[k]
).__finalize__(self)
for k, v in zip(self.columns, self._iter_column_arrays())
if not isinstance(k, int)
}
@final
@property
def _info_axis(self) -> Index:
return getattr(self, self._info_axis_name)
def _is_view_after_cow_rules(self):
# Only to be used in cases of chained assignment checks, this is a
# simplified check that assumes that either the whole object is a view
# or a copy
if len(self._mgr.blocks) == 0: # type: ignore[union-attr]
return False
return self._mgr.blocks[0].refs.has_reference() # type: ignore[union-attr]
@property
def shape(self) -> tuple[int, ...]:
"""
Return a tuple of axis dimensions
"""
return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)
@property
def axes(self) -> list[Index]:
"""
Return index label(s) of the internal NDFrame
"""
# we do it this way because if we have reversed axes, then
# the block manager shows then reversed
return [self._get_axis(a) for a in self._AXIS_ORDERS]
@final
@property
def ndim(self) -> int:
"""
Return an int representing the number of axes / array dimensions.
Return 1 if Series. Otherwise return 2 if DataFrame.
See Also
--------
ndarray.ndim : Number of array dimensions.
Examples
--------
>>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
>>> s.ndim
1
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.ndim
2
"""
return self._mgr.ndim
@final
@property
def size(self) -> int:
"""
Return an int representing the number of elements in this object.
Return the number of rows if Series. Otherwise return the number of
rows times number of columns if DataFrame.
See Also
--------
ndarray.size : Number of elements in the array.
Examples
--------
>>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
>>> s.size
3
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.size
4
"""
return int(np.prod(self.shape))
def set_axis(
self,
labels,
*,
axis: Axis = 0,
copy: bool_t | None = None,
) -> Self:
"""
Assign desired index to given axis.
Indexes for%(extended_summary_sub)s row labels can be changed by assigning
a list-like or Index.
Parameters
----------
labels : list-like, Index
The values for the new index.
axis : %(axes_single_arg)s, default 0
The axis to update. The value 0 identifies the rows. For `Series`
this parameter is unused and defaults to 0.
copy : bool, default True
Whether to make a copy of the underlying data.
.. note::
The `copy` keyword will change behavior in pandas 3.0.
`Copy-on-Write
<https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
will be enabled by default, which means that all methods with a
`copy` keyword will use a lazy copy mechanism to defer the copy and
ignore the `copy` keyword. The `copy` keyword will be removed in a
future version of pandas.
You can already get the future behavior and improvements through
enabling copy on write ``pd.options.mode.copy_on_write = True``
Returns
-------
%(klass)s
An object of type %(klass)s.
See Also
--------
%(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s.
"""
return self._set_axis_nocheck(labels, axis, inplace=False, copy=copy)
@final
def _set_axis_nocheck(
self, labels, axis: Axis, inplace: bool_t, copy: bool_t | None
):
if inplace:
setattr(self, self._get_axis_name(axis), labels)
else:
# With copy=False, we create a new object but don't copy the
# underlying data.
obj = self.copy(deep=copy and not using_copy_on_write())
setattr(obj, obj._get_axis_name(axis), labels)
return obj
@final
def _set_axis(self, axis: AxisInt, labels: AnyArrayLike | list) -> None:
"""
This is called from the cython code when we set the `index` attribute
directly, e.g. `series.index = [1, 2, 3]`.
"""
labels = ensure_index(labels)
self._mgr.set_axis(axis, labels)
self._clear_item_cache()
@final
def swapaxes(self, axis1: Axis, axis2: Axis, copy: bool_t | None = None) -> Self:
"""
Interchange axes and swap values axes appropriately.
.. deprecated:: 2.1.0
``swapaxes`` is deprecated and will be removed.
Please use ``transpose`` instead.
Returns
-------
same as input
Examples
--------
Please see examples for :meth:`DataFrame.transpose`.
"""
warnings.warn(
# GH#51946
f"'{type(self).__name__}.swapaxes' is deprecated and "
"will be removed in a future version. "
f"Please use '{type(self).__name__}.transpose' instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
i = self._get_axis_number(axis1)
j = self._get_axis_number(axis2)
if i == … a list
will call the method numerous times.
format : dict, list of dict
Keyword args to pass to the method call of ``Styler.format``. If a list will
call the method numerous times.
format_index : dict, list of dict
Keyword args to pass to the method call of ``Styler.format_index``. If a
list will call the method numerous times.
render_kwargs : dict
Keyword args to pass to the method call of ``Styler.to_latex``.
Returns
-------
str or None
If buf is None, returns the result as a string. Otherwise returns None.
"""
from pandas.io.formats.style import Styler
self = cast("DataFrame", self)
styler = Styler(self, uuid="")
for kw_name in ["hide", "relabel_index", "format", "format_index"]:
kw = vars()[kw_name]
if isinstance(kw, dict):
getattr(styler, kw_name)(**kw)
elif isinstance(kw, list):
for sub_kw in kw:
getattr(styler, kw_name)(**sub_kw)
# bold_rows is not a direct kwarg of Styler.to_latex
render_kwargs = {} if render_kwargs is None else render_kwargs
if render_kwargs.pop("bold_rows"):
styler.map_index(lambda v: "textbf:--rwrap;")
return styler.to_latex(buf=buf, **render_kwargs)
@overload
def to_csv(
self,
path_or_buf: None = ...,
sep: str = ...,
na_rep: str = ...,
float_format: str | Callable | None = ...,
columns: Sequence[Hashable] | None = ...,
header: bool_t | list[str] = ...,
index: bool_t = ...,
index_label: IndexLabel | None = ...,
mode: str = ...,
encoding: str | None = ...,
compression: CompressionOptions = ...,
quoting: int | None = ...,
quotechar: str = ...,
lineterminator: str | None = ...,
chunksize: int | None = ...,
date_format: str | None = ...,
doublequote: bool_t = ...,
escapechar: str | None = ...,
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
) -> str:
...
@overload
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
sep: str = ...,
na_rep: str = ...,
float_format: str | Callable | None = ...,
columns: Sequence[Hashable] | None = ...,
header: bool_t | list[str] = ...,
index: bool_t = ...,
index_label: IndexLabel | None = ...,
mode: str = ...,
encoding: str | None = ...,
compression: CompressionOptions = ...,
quoting: int | None = ...,
quotechar: str = ...,
lineterminator: str | None = ...,
chunksize: int | None = ...,
date_format: str | None = ...,
doublequote: bool_t = ...,
escapechar: str | None = ...,
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
) -> None:
...
@final
@deprecate_nonkeyword_arguments(
version="3.0", allowed_args=["self", "path_or_buf"], name="to_csv"
)
@doc(
storage_options=_shared_docs["storage_options"],
compression_options=_shared_docs["compression_options"] % "path_or_buf",
)
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
sep: str = ",",
na_rep: str = "",
float_format: str | Callable | None = None,
columns: Sequence[Hashable] | None = None,
header: bool_t | list[str] = True,
index: bool_t = True,
index_label: IndexLabel | None = None,
mode: str = "w",
encoding: str | None = None,
compression: CompressionOptions = "infer",
quoting: int | None = None,
quotechar: str = '"',
lineterminator: str | None = None,
chunksize: int | None = None,
date_format: str | None = None,
doublequote: bool_t = True,
escapechar: str | None = None,
decimal: str = ".",
errors: OpenFileErrors = "strict",
storage_options: StorageOptions | None = None,
) -> str | None:
r"""
Write object to a comma-separated values (csv) file.
Parameters
----------
path_or_buf : str, path object, file-like object, or None, default None
String, path object (implementing os.PathLike[str]), or file-like
object implementing a write() function. If None, the result is
returned as a string. If a non-binary file object is passed, it should
be opened with `newline=''`, disabling universal newlines. If a binary
file object is passed, `mode` might need to contain a `'b'`.
sep : str, default ','
String of length 1. Field delimiter for the output file.
na_rep : str, default ''
Missing data representation.
float_format : str, Callable, default None
Format string for floating point numbers. If a Callable is given, it takes
precedence over other numeric formatting parameters, like decimal.
columns : sequence, optional
Columns to write.
header : bool or list of str, default True
Write out the column names. If a list of strings is given it is
assumed to be aliases for the column names.
index : bool, default True
Write row names (index).
index_label : str or sequence, or False, default None
Column label for index column(s) if desired. If None is given, and
`header` and `index` are True, then the index names are used. A
sequence should be given if the object uses MultiIndex. If
False do not print fields for index names. Use index_label=False
for easier importing in R.
mode : {{'w', 'x', 'a'}}, default 'w'
Forwarded to either `open(mode=)` or `fsspec.open(mode=)` to control
the file opening. Typical values include:
- 'w', truncate the file first.
- 'x', exclusive creation, failing if the file already exists.
- 'a', append to the end of file if it exists.
encoding : str, optional
A string representing the encoding to use in the output file,
defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
is a non-binary file object.
{compression_options}
May be a dict with key 'method' as compression mode
and other entries as additional compression options if
compression mode is 'zip'.
Passing compression options as keys in dict is
supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.
quoting : optional constant from csv module
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
will treat them as non-numeric.
quotechar : str, default '\"'
String of length 1. Character used to quote fields.
lineterminator : str, optional
The newline character or character sequence to use in the output
file. Defaults to `os.linesep`, which depends on the OS in which
this method is called ('\\n' for linux, '\\r\\n' for Windows, i.e.).
.. versionchanged:: 1.5.0
Previously was line_terminator, changed for consistency with
read_csv and the standard library 'csv' module.
chunksize : int or None
Rows to write at a time.
date_format : str, default None
Format string for datetime objects.
doublequote : bool, default True
Control quoting of `quotechar` inside a field.
escapechar : str, default None
String of length 1. Character used to escape `sep` and `quotechar`
when appropriate.
decimal : str, default '.'
Character recognized as decimal separator. E.g. use ',' for
European data.
errors : str, default 'strict'
Specifies how encoding and decoding errors are to be handled.
See the errors argument for :func:`open` for a full list
of options.
{storage_options}
Returns
-------
None or str
If path_or_buf is None, returns the resulting csv format as a
string. Otherwise returns None.
See Also
--------
read_csv : Load a CSV file into a DataFrame.
to_excel : Write DataFrame to an Excel file.
Examples
--------
Create 'out.csv' containing 'df' without indices
>>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'],
... 'mask': ['red', 'purple'],
... 'weapon': ['sai', 'bo staff']}})
>>> df.to_csv('out.csv', index=False) # doctest: +SKIP
Create 'out.zip' containing 'out.csv'
>>> df.to_csv(index=False)
'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
>>> compression_opts = dict(method='zip',
... archive_name='out.csv') # doctest: +SKIP
>>> df.to_csv('out.zip', index=False,
... compression=compression_opts) # doctest: +SKIP
To write a csv file to a new folder or nested folder you will first
need to create it using either Pathlib or os:
>>> from pathlib import Path # doctest: +SKIP
>>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP
>>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP
>>> df.to_csv(filepath) # doctest: +SKIP
>>> import os # doctest: +SKIP
>>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP
>>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP
"""
df = self if isinstance(self, ABCDataFrame) else self.to_frame()
formatter = DataFrameFormatter(
frame=df,
header=header,
index=index,
na_rep=na_rep,
float_format=float_format,
decimal=decimal,
)
return DataFrameRenderer(formatter).to_csv(
path_or_buf,
lineterminator=lineterminator,
sep=sep,
encoding=encoding,
errors=errors,
compression=compression,
quoting=quoting,
columns=columns,
index_label=index_label,
mode=mode,
chunksize=chunksize,
quotechar=quotechar,
date_format=date_format,
doublequote=doublequote,
escapechar=escapechar,
storage_options=storage_options,
)
# ----------------------------------------------------------------------
# Lookup Caching
def _reset_cacher(self) -> None:
"""
Reset the cacher.
"""
raise AbstractMethodError(self)
def _maybe_update_cacher(
self,
clear: bool_t = False,
verify_is_copy: bool_t = True,
inplace: bool_t = False,
) -> None:
"""
See if we need to update our parent cacher if clear, then clear our
cache.
Parameters
----------
clear : bool, default False
Clear the item cache.
verify_is_copy : bool, default True
Provide is_copy checks.
"""
if using_copy_on_write():
return
if verify_is_copy:
self._check_setitem_copy(t="referent")
if clear:
self._clear_item_cache()
def _clear_item_cache(self) -> None:
raise AbstractMethodError(self)
# ----------------------------------------------------------------------
# Indexing Methods
@final
def take(self, indices, axis: Axis = 0, **kwargs) -> Self:
"""
Return the elements in the given *positional* indices along an axis.
This means that we are not indexing according to actual values in
the index attribute of the object. We are indexing according to the
actual position of the element in the object.
Parameters
----------
indices : array-like
An array of ints indicating which positions to take.
axis : {0 or 'index', 1 or 'columns', None}, default 0
The axis on which to select elements. ``0`` means that we are
selecting rows, ``1`` means that we are selecting columns.
For `Series` this parameter is unused and defaults to 0.
**kwargs
For compatibility with :meth:`numpy.take`. Has no effect on the
output.
Returns
-------
same type as caller
An array-like containing the elements taken from the object.
See Also
--------
DataFrame.loc : Select a subset of a DataFrame by labels.
DataFrame.iloc : Select a subset of a DataFrame by positions.
numpy.take : Take elements from an array along an axis.
Examples
--------
>>> df = pd.DataFrame([('falcon', 'bird', 389.0),
... ('parrot', 'bird', 24.0),
... ('lion', 'mammal', 80.5),
... ('monkey', 'mammal', np.nan)],
... columns=['name', 'class', 'max_speed'],
... index=[0, 2, 3, 1])
>>> df
name class max_speed
0 falcon bird 389.0
2 parrot bird 24.0
3 lion mammal 80.5
1 monkey mammal NaN
Take elements at positions 0 and 3 along the axis 0 (default).
Note how the actual indices selected (0 and 1) do not correspond to
our selected indices 0 and 3. That's because we are selecting the 0th
and 3rd rows, not rows whose indices equal 0 and 3.
>>> df.take([0, 3])
name class max_speed
0 falcon bird 389.0
1 monkey mammal NaN
Take elements at indices 1 and 2 along the axis 1 (column selection).
>>> df.take([1, 2], axis=1)
class max_speed
0 bird 389.0
2 bird 24.0
3 mammal 80.5
1 mammal NaN
We may take elements using negative integers for positive indices,
starting from the end of the object, just like with Python lists.
>>> df.take([-1, -2])
name class max_speed
1 monkey mammal NaN
3 lion mammal 80.5
"""
nv.validate_take((), kwargs)
if not isinstance(indices, slice):
indices = np.asarray(indices, dtype=np.intp)
if (
axis == 0
and indices.ndim == 1
and using_copy_on_write()
and is_range_indexer(indices, len(self))
):
return self.copy(deep=None)
elif self.ndim == 1:
raise TypeError(
f"{type(self).__name__}.take requires a sequence of integers, "
"not slice."
)
else:
warnings.warn(
# GH#51539
f"Passing a slice to {type(self).__name__}.take is deprecated "
"and will raise in a future version. Use `obj[slicer]` or pass "
"a sequence of integers instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
# We can get here with a slice via DataFrame.__getitem__
indices = np.arange(
indices.start, indices.stop, indices.step, dtype=np.intp
)
new_data = self._mgr.take(
indices,
axis=self._get_block_manager_axis(axis),
verify=True,
)
return self._constructor_from_mgr(new_data, axes=new_data.axes).__finalize__(
self, method="take"
)
@final
def _take_with_is_copy(self, indices, axis: Axis = 0) -> Self:
"""
Internal version of the `take` method that sets the `_is_copy`
attribute to keep track of the parent dataframe (using in indexing
for the SettingWithCopyWarning).
For Series this does the same as the public take (it never sets `_is_copy`).
See the docstring of `take` for full explanation of the parameters.
"""
result = self.take(indices=indices, axis=axis)
# Maybe set copy if we didn't actually change the index.
if self.ndim == 2 and not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
return result
@final
def xs(
self,
key: IndexLabel,
axis: Axis = 0,
level: IndexLabel | None = None,
drop_level: bool_t = True,
) -> Self:
"""
Return cross-section from the Series/DataFrame.
This method takes a `key` argument to select data at a particular
level of a MultiIndex.
Parameters
----------
key : label or tuple of label
Label contained in the index, or partially in a MultiIndex.
axis : {0 or 'index', 1 or 'columns'}, default 0
Axis to retrieve cross-section on.
level : object, defaults to first n levels (n=1 or len(key))
In case of a key partially contained in a MultiIndex, indicate
which levels are used. Levels can be referred by label or position.
drop_level : bool, default True
If False, returns object with same levels as self.
Returns
-------
Series or DataFrame
Cross-section from the original Series or DataFrame
corresponding to the selected index levels.
See Also
--------
DataFrame.loc : Access a group of rows and columns
by label(s) or a boolean array.
DataFrame.iloc : Purely integer-location based indexing
for selection by position.
Notes
-----
`xs` can not be used to set values.
MultiIndex Slicers is a generic way to get/set values on
any level or levels.
It is a superset of `xs` functionality, see
:ref:`MultiIndex Slicers <advanced.mi_slicers>`.
Examples
--------
>>> d = {'num_legs': [4, 4, 2, 2],
... 'num_wings': [0, 0, 2, 2],
... 'class': ['mammal', 'mammal', 'mammal', 'bird'],
... 'animal': ['cat', 'dog', 'bat', 'penguin'],
... 'locomotion': ['walks', 'walks', 'flies', 'walks']}
>>> df = pd.DataFrame(data=d)
>>> df = df.set_index(['class', 'animal', 'locomotion'])
>>> df
num_legs num_wings
class animal locomotion
mammal cat walks 4 0
dog walks 4 0
bat flies 2 2
bird penguin walks 2 2
Get values at specified index
>>> df.xs('mammal')
num_legs num_wings
animal locomotion
cat walks 4 0
dog walks 4 0
bat flies 2 2
Get values at several indexes
>>> df.xs(('mammal', 'dog', 'walks'))
num_legs 4
num_wings 0
Name: (mammal, dog, walks), dtype: int64
Get values at specified index and level
>>> df.xs('cat', level=1)
num_legs num_wings
class locomotion
mammal walks 4 0
Get values at several indexes and levels
>>> df.xs(('bird', 'walks'),
... level=[0, 'locomotion'])
num_legs num_wings
animal
penguin 2 2
Get values at specified column and axis
>>> df.xs('num_wings', axis=1)
class animal locomotion
mammal cat walks 0
dog walks 0
bat flies 2
bird penguin walks 2
Name: num_wings, dtype: int64
"""
axis = self._get_axis_number(axis)
labels = self._get_axis(axis)
if isinstance(key, list):
raise TypeError("list keys are not supported in xs, pass a tuple instead")
if level is not None:
if not isinstance(labels, MultiIndex):
raise TypeError("Index must be a MultiIndex")
loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)
# create the tuple of the indexer
_indexer = [slice(None)] * self.ndim
_indexer[axis] = loc
indexer = tuple(_indexer)
result = self.iloc[indexer]
setattr(result, result._get_axis_name(axis), new_ax)
return result
if axis == 1:
if drop_level:
return self[key]
index = self.columns
else:
index = self.index
if isinstance(index, MultiIndex):
loc, new_index = index._get_loc_level(key, level=0)
if not drop_level:
if lib.is_integer(loc):
# Slice index must be an integer or None
new_index = index[loc : loc + 1]
else:
new_index = index[loc]
else:
loc = index.get_loc(key)
if isinstance(loc, np.ndarray):
if loc.dtype == np.bool_:
(inds,) = loc.nonzero()
return self._take_with_is_copy(inds, axis=axis)
else:
return self._take_with_is_copy(loc, axis=axis)
if not is_scalar(loc):
new_index = index[loc]
if is_scalar(loc) and axis == 0:
# In this case loc should be an integer
if self.ndim == 1:
# if we encounter an array-like and we only have 1 dim
# that means that their are list/ndarrays inside the Series!
# so just return them (GH 6394)
return self._values[loc]
new_mgr = self._mgr.fast_xs(loc)
result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes)
result._name = self.index[loc]
result = result.__finalize__(self)
elif is_scalar(loc):
result = self.iloc[:, slice(loc, loc + 1)]
elif axis == 1:
result = self.iloc[:, loc]
else:
result = self.iloc[loc]
result.index = new_index
# this could be a view
# but only in a single-dtyped view sliceable case
result._set_is_copy(self, copy=not result._is_view)
return result
def __getitem__(self, item):
raise AbstractMethodError(self)
@final
def _getitem_slice(self, key: slice) -> Self:
"""
__getitem__ for the case where the key is a slice object.
"""
# _convert_slice_indexer to determine if this slice is positional
# or label based, and if the latter, convert to positional
slobj = self.index._convert_slice_indexer(key, kind="getitem")
if isinstance(slobj, np.ndarray):
# reachable with DatetimeIndex
indexer = lib.maybe_indices_to_slice(
slobj.astype(np.intp, copy=False), len(self)
)
if isinstance(indexer, np.ndarray):
# GH#43223 If we can not convert, use take
return self.take(indexer, axis=0)
slobj = indexer
return self._slice(slobj)
def _slice(self, slobj: slice, axis: AxisInt = 0) -> Self:
"""
Construct a slice of this container.
Slicing with this method is *always* positional.
"""
assert isinstance(slobj, slice), type(slobj)
axis = self._get_block_manager_axis(axis)
new_mgr = self._mgr.get_slice(slobj, axis=axis)
result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
result = result.__finalize__(self)
# this could be a view
# but only in a single-dtyped view sliceable case
is_copy = axis != 0 or result._is_view
result._set_is_copy(self, copy=is_copy)
return result
@final
def _set_is_copy(self, ref: NDFrame, copy: bool_t = True) -> None:
if not copy:
self._is_copy = None
else:
assert ref is not None
self._is_copy = weakref.ref(ref)
def _check_is_chained_assignment_possible(self) -> bool_t:
"""
Check if we are a view, have a cacher, and are of mixed type.
If so, then force a setitem_copy check.
Should be called just near setting a value
Will return a boolean if it we are a view and are cached, but a
single-dtype meaning that the cacher should be updated following
setting.
"""
if self._is_copy:
self._check_setitem_copy(t="referent")
return False
@final
def _check_setitem_copy(self, t: str = "setting", force: bool_t = False):
"""
Parameters
----------
t : str, the type of setting error
force : bool, default False
If True, then force showing an error.
validate if we are doing a setitem on a chained copy.
It is technically possible to figure out that we are setting on
a copy even WITH a multi-dtyped pandas object. In other words, some
blocks may be views while other are not. Currently _is_view will ALWAYS
return False for multi-blocks to avoid having to handle this case.
df = DataFrame(np.arange(0,9), columns=['count'])
df['group'] = 'b'
# This technically need not raise SettingWithCopy if both are view
# (which is not generally guaranteed but is usually True. However,
# this is in general not a good practice and we recommend using .loc.
df.iloc[0:5]['group'] = 'a'
"""
if using_copy_on_write() or warn_copy_on_write():
return
# return early if the check is not needed
if not (force or self._is_copy):
return
value = config.get_option("mode.chained_assignment")
if value is None:
return
# see if the copy is not actually referred; if so, then dissolve
# the copy weakref
if self._is_copy is not None and not isinstance(self._is_copy, str):
r = self._is_copy()
if not gc.get_referents(r) or (r is not None and r.shape == self.shape):
self._is_copy = None
return
# a custom message
if isinstance(self._is_copy, str):
t = self._is_copy
elif t == "referent":
t = (
"\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame\n\n"
"See the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy"
)
else:
t = (
"\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame.\n"
"Try using .loc[row_indexer,col_indexer] = value "
"instead\n\nSee the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy"
)
if value == "raise":
raise SettingWithCopyError(t)
if value == "warn":
> warnings.warn(t, SettingWithCopyWarning, stacklevel=find_stack_level())
E pandas.errors.SettingWithCopyWarning:
E A value is trying to be set on a copy of a slice from a DataFrame.
E Try using .loc[row_indexer,col_indexer] = value instead
E
E See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\generic.py:4472: SettingWithCopyWarning
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
10 out of 12 runs failed: test_merge[True-left] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 8s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 4s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 6s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 5s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 6s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 7s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 8s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 6s]
Raw output
pandas.errors.SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:56568', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'left', disk = True
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
await list_eq(joined, pd.merge(A, B, on="y", how=how))
assert all(d is None for d in joined.divisions)
await list_eq(
dd.merge(a, b, left_on="x", right_on="z", how=how),
pd.merge(A, B, left_on="x", right_on="z", how=how),
)
await list_eq(
dd.merge(a, b, left_on="x", right_on="z", how=how, suffixes=("1", "2")),
pd.merge(A, B, left_on="x", right_on="z", how=how, suffixes=("1", "2")),
)
await list_eq(dd.merge(a, b, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(a, B, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(A, b, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(A, B, how=how), pd.merge(A, B, how=how))
await list_eq(
dd.merge(a, b, left_index=True, right_index=True, how=how),
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
await list_eq(
dd.merge(
a, b, left_index=True, right_index=True, how=how, suffixes=("1", "2")
),
pd.merge(
A, B, left_index=True, right_index=True, how=how, suffixes=("1", "2")
),
)
> await list_eq(
dd.merge(a, b, left_on="x", right_index=True, how=how),
pd.merge(A, B, left_on="x", right_index=True, how=how),
)
distributed\shuffle\tests\test_merge.py:218:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:336: in _result
raise exc.with_traceback(tb)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\dask_expr\_merge.py:775: in assign_index_merge_transfer
index["_index"] = df.index
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4299: in __setitem__
self._set_item(key, value)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4526: in _set_item
self._set_item_mgr(key, value, refs)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4484: in _set_item_mgr
self._check_setitem_copy()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
# pyright: reportPropertyTypeMismatch=false
from __future__ import annotations
import collections
from copy import deepcopy
import datetime as dt
from functools import partial
import gc
from json import loads
import operator
import pickle
import re
import sys
from typing import (
TYPE_CHECKING,
Any,
Callable,
ClassVar,
Literal,
NoReturn,
cast,
final,
overload,
)
import warnings
import weakref
import numpy as np
from pandas._config import (
config,
using_copy_on_write,
warn_copy_on_write,
)
from pandas._libs import lib
from pandas._libs.lib import is_range_indexer
from pandas._libs.tslibs import (
Period,
Tick,
Timestamp,
to_offset,
)
from pandas._libs.tslibs.dtypes import freq_to_period_freqstr
from pandas._typing import (
AlignJoin,
AnyArrayLike,
ArrayLike,
Axes,
Axis,
AxisInt,
CompressionOptions,
DtypeArg,
DtypeBackend,
DtypeObj,
FilePath,
FillnaOptions,
FloatFormatType,
FormattersType,
Frequency,
IgnoreRaise,
IndexKeyFunc,
IndexLabel,
InterpolateOptions,
IntervalClosedType,
JSONSerializable,
Level,
Manager,
NaPosition,
NDFrameT,
OpenFileErrors,
RandomState,
ReindexMethod,
Renamer,
Scalar,
Self,
SequenceNotStr,
SortKind,
StorageOptions,
Suffixes,
T,
TimeAmbiguous,
TimedeltaConvertibleTypes,
TimeNonexistent,
TimestampConvertibleTypes,
TimeUnit,
ValueKeyFunc,
WriteBuffer,
WriteExcelBuffer,
npt,
)
from pandas.compat import PYPY
from pandas.compat._constants import REF_COUNT
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.errors import (
AbstractMethodError,
ChainedAssignmentError,
InvalidIndexError,
SettingWithCopyError,
SettingWithCopyWarning,
_chained_assignment_method_msg,
_chained_assignment_warning_method_msg,
_check_cacher,
)
from pandas.util._decorators import (
deprecate_nonkeyword_arguments,
doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
check_dtype_backend,
validate_ascending,
validate_bool_kwarg,
validate_fillna_kwargs,
validate_inclusive,
)
from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.common import (
ensure_object,
ensure_platform_int,
ensure_str,
is_bool,
is_bool_dtype,
is_dict_like,
is_extension_array_dtype,
is_list_like,
is_number,
is_numeric_dtype,
is_re_compilable,
is_scalar,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
ExtensionDtype,
)
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCSeries,
)
from pandas.core.dtypes.inference import (
is_hashable,
is_nested_list_like,
)
from pandas.core.dtypes.missing import (
isna,
notna,
)
from pandas.core import (
algorithms as algos,
arraylike,
common,
indexing,
missing,
nanops,
sample,
)
from pandas.core.array_algos.replace import should_use_regex
from pandas.core.arrays import ExtensionArray
from pandas.core.base import PandasObject
from pandas.core.construction import extract_array
from pandas.core.flags import Flags
from pandas.core.indexes.api import (
DatetimeIndex,
Index,
MultiIndex,
PeriodIndex,
RangeIndex,
default_index,
ensure_index,
)
from pandas.core.internals import (
ArrayManager,
BlockManager,
SingleArrayManager,
)
from pandas.core.internals.construction import (
mgr_to_mgr,
ndarray_to_mgr,
)
from pandas.core.methods.describe import describe_ndframe
from pandas.core.missing import (
clean_fill_method,
clean_reindex_fill_method,
find_valid_index,
)
from pandas.core.reshape.concat import concat
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import get_indexer_indexer
from pandas.core.window import (
Expanding,
ExponentialMovingWindow,
Rolling,
Window,
)
from pandas.io.formats.format import (
DataFrameFormatter,
DataFrameRenderer,
)
from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
from collections.abc import (
Hashable,
Iterator,
Mapping,
Sequence,
)
from pandas._libs.tslibs import BaseOffset
from pandas import (
DataFrame,
ExcelWriter,
HDFStore,
Series,
)
from pandas.core.indexers.objects import BaseIndexer
from pandas.core.resample import Resampler
# goal is to be able to define the docs close to function, while still being
# able to share
_shared_docs = {**_shared_docs}
_shared_doc_kwargs = {
"axes": "keywords for axes",
"klass": "Series/DataFrame",
"axes_single_arg": "{0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame", # noqa: E501
"inplace": """
inplace : bool, default False
If True, performs operation inplace and returns None.""",
"optional_by": """
by : str or list of str
Name or list of names to sort by""",
}
bool_t = bool # Need alias because NDFrame has def bool:
class NDFrame(PandasObject, indexing.IndexingMixin):
"""
N-dimensional analogue of DataFrame. Store multi-dimensional in a
size-mutable, labeled data structure
Parameters
----------
data : BlockManager
axes : list
copy : bool, default False
"""
_internal_names: list[str] = [
"_mgr",
"_cacher",
"_item_cache",
"_cache",
"_is_copy",
"_name",
"_metadata",
"_flags",
]
_internal_names_set: set[str] = set(_internal_names)
_accessors: set[str] = set()
_hidden_attrs: frozenset[str] = frozenset([])
_metadata: list[str] = []
_is_copy: weakref.ReferenceType[NDFrame] | str | None = None
_mgr: Manager
_attrs: dict[Hashable, Any]
_typ: str
# ----------------------------------------------------------------------
# Constructors
def __init__(self, data: Manager) -> None:
object.__setattr__(self, "_is_copy", None)
object.__setattr__(self, "_mgr", data)
object.__setattr__(self, "_item_cache", {})
object.__setattr__(self, "_attrs", {})
object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))
@final
@classmethod
def _init_mgr(
cls,
mgr: Manager,
axes: dict[Literal["index", "columns"], Axes | None],
dtype: DtypeObj | None = None,
copy: bool_t = False,
) -> Manager:
"""passed a manager and a axes dict"""
for a, axe in axes.items():
if axe is not None:
axe = ensure_index(axe)
bm_axis = cls._get_block_manager_axis(a)
mgr = mgr.reindex_axis(axe, axis=bm_axis)
# make a copy if explicitly requested
if copy:
mgr = mgr.copy()
if dtype is not None:
# avoid further copies if we can
if (
isinstance(mgr, BlockManager)
and len(mgr.blocks) == 1
and mgr.blocks[0].values.dtype == dtype
):
pass
else:
mgr = mgr.astype(dtype=dtype)
return mgr
@final
def _as_manager(self, typ: str, copy: bool_t = True) -> Self:
"""
Private helper function to create a DataFrame with specific manager.
Parameters
----------
typ : {"block", "array"}
copy : bool, default True
Only controls whether the conversion from Block->ArrayManager
copies the 1D arrays (to ensure proper/contiguous memory layout).
Returns
-------
DataFrame
New DataFrame using specified manager type. Is not guaranteed
to be a copy or not.
"""
new_mgr: Manager
new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
# fastpath of passing a manager doesn't check the option/manager class
return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self)
@classmethod
def _from_mgr(cls, mgr: Manager, axes: list[Index]) -> Self:
"""
Construct a new object of this type from a Manager object and axes.
Parameters
----------
mgr : Manager
Must have the same ndim as cls.
axes : list[Index]
Notes
-----
The axes must match mgr.axes, but are required for future-proofing
in the event that axes are refactored out of the Manager objects.
"""
obj = cls.__new__(cls)
NDFrame.__init__(obj, mgr)
return obj
# ----------------------------------------------------------------------
# attrs and flags
@property
def attrs(self) -> dict[Hashable, Any]:
"""
Dictionary of global attributes of this dataset.
.. warning::
attrs is experimental and may change without warning.
See Also
--------
DataFrame.flags : Global flags applying to this object.
Notes
-----
Many operations that create new datasets will copy ``attrs``. Copies
are always deep so that changing ``attrs`` will only affect the
present dataset. ``pandas.concat`` copies ``attrs`` only if all input
datasets have the same ``attrs``.
Examples
--------
For Series:
>>> ser = pd.Series([1, 2, 3])
>>> ser.attrs = {"A": [10, 20, 30]}
>>> ser.attrs
{'A': [10, 20, 30]}
For DataFrame:
>>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
>>> df.attrs = {"A": [10, 20, 30]}
>>> df.attrs
{'A': [10, 20, 30]}
"""
return self._attrs
@attrs.setter
def attrs(self, value: Mapping[Hashable, Any]) -> None:
self._attrs = dict(value)
@final
@property
def flags(self) -> Flags:
"""
Get the properties associated with this pandas object.
The available flags are
* :attr:`Flags.allows_duplicate_labels`
See Also
--------
Flags : Flags that apply to pandas objects.
DataFrame.attrs : Global metadata applying to this dataset.
Notes
-----
"Flags" differ from "metadata". Flags reflect properties of the
pandas object (the Series or DataFrame). Metadata refer to properties
of the dataset, and should be stored in :attr:`DataFrame.attrs`.
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2]})
>>> df.flags
<Flags(allows_duplicate_labels=True)>
Flags can be get or set using ``.``
>>> df.flags.allows_duplicate_labels
True
>>> df.flags.allows_duplicate_labels = False
Or by slicing with a key
>>> df.flags["allows_duplicate_labels"]
False
>>> df.flags["allows_duplicate_labels"] = True
"""
return self._flags
@final
def set_flags(
self,
*,
copy: bool_t = False,
allows_duplicate_labels: bool_t | None = None,
) -> Self:
"""
Return a new object with updated flags.
Parameters
----------
copy : bool, default False
Specify if a copy of the object should be made.
.. note::
The `copy` keyword will change behavior in pandas 3.0.
`Copy-on-Write
<https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
will be enabled by default, which means that all methods with a
`copy` keyword will use a lazy copy mechanism to defer the copy and
ignore the `copy` keyword. The `copy` keyword will be removed in a
future version of pandas.
You can already get the future behavior and improvements through
enabling copy on write ``pd.options.mode.copy_on_write = True``
allows_duplicate_labels : bool, optional
Whether the returned object allows duplicate labels.
Returns
-------
Series or DataFrame
The same type as the caller.
See Also
--------
DataFrame.attrs : Global metadata applying to this dataset.
DataFrame.flags : Global flags applying to this object.
Notes
-----
This method returns a new object that's a view on the same data
as the input. Mutating the input or the output values will be reflected
in the other.
This method is intended to be used in method chains.
"Flags" differ from "metadata". Flags reflect properties of the
pandas object (the Series or DataFrame). Metadata refer to properties
of the dataset, and should be stored in :attr:`DataFrame.attrs`.
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2]})
>>> df.flags.allows_duplicate_labels
True
>>> df2 = df.set_flags(allows_duplicate_labels=False)
>>> df2.flags.allows_duplicate_labels
False
"""
df = self.copy(deep=copy and not using_copy_on_write())
if allows_duplicate_labels is not None:
df.flags["allows_duplicate_labels"] = allows_duplicate_labels
return df
@final
@classmethod
def _validate_dtype(cls, dtype) -> DtypeObj | None:
"""validate the passed dtype"""
if dtype is not None:
dtype = pandas_dtype(dtype)
# a compound dtype
if dtype.kind == "V":
raise NotImplementedError(
"compound dtypes are not implemented "
f"in the {cls.__name__} constructor"
)
return dtype
# ----------------------------------------------------------------------
# Construction
@property
def _constructor(self) -> Callable[..., Self]:
"""
Used when a manipulation result has the same dimensions as the
original.
"""
raise AbstractMethodError(self)
# ----------------------------------------------------------------------
# Internals
@final
@property
def _data(self):
# GH#33054 retained because some downstream packages uses this,
# e.g. fastparquet
# GH#33333
warnings.warn(
f"{type(self).__name__}._data is deprecated and will be removed in "
"a future version. Use public APIs instead.",
DeprecationWarning,
stacklevel=find_stack_level(),
)
return self._mgr
# ----------------------------------------------------------------------
# Axis
_AXIS_ORDERS: list[Literal["index", "columns"]]
_AXIS_TO_AXIS_NUMBER: dict[Axis, AxisInt] = {0: 0, "index": 0, "rows": 0}
_info_axis_number: int
_info_axis_name: Literal["index", "columns"]
_AXIS_LEN: int
@final
def _construct_axes_dict(self, axes: Sequence[Axis] | None = None, **kwargs):
"""Return an axes dictionary for myself."""
d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
# error: Argument 1 to "update" of "MutableMapping" has incompatible type
# "Dict[str, Any]"; expected "SupportsKeysAndGetItem[Union[int, str], Any]"
d.update(kwargs) # type: ignore[arg-type]
return d
@final
@classmethod
def _get_axis_number(cls, axis: Axis) -> AxisInt:
try:
return cls._AXIS_TO_AXIS_NUMBER[axis]
except KeyError:
raise ValueError(f"No axis named {axis} for object type {cls.__name__}")
@final
@classmethod
def _get_axis_name(cls, axis: Axis) -> Literal["index", "columns"]:
axis_number = cls._get_axis_number(axis)
return cls._AXIS_ORDERS[axis_number]
@final
def _get_axis(self, axis: Axis) -> Index:
axis_number = self._get_axis_number(axis)
assert axis_number in {0, 1}
return self.index if axis_number == 0 else self.columns
@final
@classmethod
def _get_block_manager_axis(cls, axis: Axis) -> AxisInt:
"""Map the axis to the block_manager axis."""
axis = cls._get_axis_number(axis)
ndim = cls._AXIS_LEN
if ndim == 2:
# i.e. DataFrame
return 1 - axis
return axis
@final
def _get_axis_resolvers(self, axis: str) -> dict[str, Series | MultiIndex]:
# index or columns
axis_index = getattr(self, axis)
d = {}
prefix = axis[0]
for i, name in enumerate(axis_index.names):
if name is not None:
key = level = name
else:
# prefix with 'i' or 'c' depending on the input axis
# e.g., you must do ilevel_0 for the 0th level of an unnamed
# multiiindex
key = f"{prefix}level_{i}"
level = i
level_values = axis_index.get_level_values(level)
s = level_values.to_series()
s.index = axis_index
d[key] = s
# put the index/columns itself in the dict
if isinstance(axis_index, MultiIndex):
dindex = axis_index
else:
dindex = axis_index.to_series()
d[axis] = dindex
return d
@final
def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]:
from pandas.core.computation.parsing import clean_column_name
d: dict[str, Series | MultiIndex] = {}
for axis_name in self._AXIS_ORDERS:
d.update(self._get_axis_resolvers(axis_name))
return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)}
@final
def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:
"""
Return the special character free column resolvers of a dataframe.
Column names with special characters are 'cleaned up' so that they can
be referred to by backtick quoting.
Used in :meth:`DataFrame.eval`.
"""
from pandas.core.computation.parsing import clean_column_name
from pandas.core.series import Series
if isinstance(self, ABCSeries):
return {clean_column_name(self.name): self}
return {
clean_column_name(k): Series(
v, copy=False, index=self.index, name=k, dtype=self.dtypes[k]
).__finalize__(self)
for k, v in zip(self.columns, self._iter_column_arrays())
if not isinstance(k, int)
}
@final
@property
def _info_axis(self) -> Index:
return getattr(self, self._info_axis_name)
def _is_view_after_cow_rules(self):
# Only to be used in cases of chained assignment checks, this is a
# simplified check that assumes that either the whole object is a view
# or a copy
if len(self._mgr.blocks) == 0: # type: ignore[union-attr]
return False
return self._mgr.blocks[0].refs.has_reference() # type: ignore[union-attr]
@property
def shape(self) -> tuple[int, ...]:
"""
Return a tuple of axis dimensions
"""
return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)
@property
def axes(self) -> list[Index]:
"""
Return index label(s) of the internal NDFrame
"""
# we do it this way because if we have reversed axes, then
# the block manager shows then reversed
return [self._get_axis(a) for a in self._AXIS_ORDERS]
@final
@property
def ndim(self) -> int:
"""
Return an int representing the number of axes / array dimensions.
Return 1 if Series. Otherwise return 2 if DataFrame.
See Also
--------
ndarray.ndim : Number of array dimensions.
Examples
--------
>>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
>>> s.ndim
1
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.ndim
2
"""
return self._mgr.ndim
@final
@property
def size(self) -> int:
"""
Return an int representing the number of elements in this object.
Return the number of rows if Series. Otherwise return the number of
rows times number of columns if DataFrame.
See Also
--------
ndarray.size : Number of elements in the array.
Examples
--------
>>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
>>> s.size
3
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.size
4
"""
return int(np.prod(self.shape))
def set_axis(
self,
labels,
*,
axis: Axis = 0,
copy: bool_t | None = None,
) -> Self:
"""
Assign desired index to given axis.
Indexes for%(extended_summary_sub)s row labels can be changed by assigning
a list-like or Index.
Parameters
----------
labels : list-like, Index
The values for the new index.
axis : %(axes_single_arg)s, default 0
The axis to update. The value 0 identifies the rows. For `Series`
this parameter is unused and defaults to 0.
copy : bool, default True
Whether to make a copy of the underlying data.
.. note::
The `copy` keyword will change behavior in pandas 3.0.
`Copy-on-Write
<https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
will be enabled by default, which means that all methods with a
`copy` keyword will use a lazy copy mechanism to defer the copy and
ignore the `copy` keyword. The `copy` keyword will be removed in a
future version of pandas.
You can already get the future behavior and improvements through
enabling copy on write ``pd.options.mode.copy_on_write = True``
Returns
-------
%(klass)s
An object of type %(klass)s.
See Also
--------
%(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s.
"""
return self._set_axis_nocheck(labels, axis, inplace=False, copy=copy)
@final
def _set_axis_nocheck(
self, labels, axis: Axis, inplace: bool_t, copy: bool_t | None
):
if inplace:
setattr(self, self._get_axis_name(axis), labels)
else:
# With copy=False, we create a new object but don't copy the
# underlying data.
obj = self.copy(deep=copy and not using_copy_on_write())
setattr(obj, obj._get_axis_name(axis), labels)
return obj
@final
def _set_axis(self, axis: AxisInt, labels: AnyArrayLike | list) -> None:
"""
This is called from the cython code when we set the `index` attribute
directly, e.g. `series.index = [1, 2, 3]`.
"""
labels = ensure_index(labels)
self._mgr.set_axis(axis, labels)
self._clear_item_cache()
@final
def swapaxes(self, axis1: Axis, axis2: Axis, copy: bool_t | None = None) -> Self:
"""
Interchange axes and swap values axes appropriately.
.. deprecated:: 2.1.0
``swapaxes`` is deprecated and will be removed.
Please use ``transpose`` instead.
Returns
-------
same as input
Examples
--------
Please see examples for :meth:`DataFrame.transpose`.
"""
warnings.warn(
# GH#51946
f"'{type(self).__name__}.swapaxes' is deprecated and "
"will be removed in a future version. "
f"Please use '{type(self).__name__}.transpose' instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
i = self._get_axis_number(axis1)
j = self._get_axis_number(axis2)
if i == j… a list
will call the method numerous times.
format : dict, list of dict
Keyword args to pass to the method call of ``Styler.format``. If a list will
call the method numerous times.
format_index : dict, list of dict
Keyword args to pass to the method call of ``Styler.format_index``. If a
list will call the method numerous times.
render_kwargs : dict
Keyword args to pass to the method call of ``Styler.to_latex``.
Returns
-------
str or None
If buf is None, returns the result as a string. Otherwise returns None.
"""
from pandas.io.formats.style import Styler
self = cast("DataFrame", self)
styler = Styler(self, uuid="")
for kw_name in ["hide", "relabel_index", "format", "format_index"]:
kw = vars()[kw_name]
if isinstance(kw, dict):
getattr(styler, kw_name)(**kw)
elif isinstance(kw, list):
for sub_kw in kw:
getattr(styler, kw_name)(**sub_kw)
# bold_rows is not a direct kwarg of Styler.to_latex
render_kwargs = {} if render_kwargs is None else render_kwargs
if render_kwargs.pop("bold_rows"):
styler.map_index(lambda v: "textbf:--rwrap;")
return styler.to_latex(buf=buf, **render_kwargs)
@overload
def to_csv(
self,
path_or_buf: None = ...,
sep: str = ...,
na_rep: str = ...,
float_format: str | Callable | None = ...,
columns: Sequence[Hashable] | None = ...,
header: bool_t | list[str] = ...,
index: bool_t = ...,
index_label: IndexLabel | None = ...,
mode: str = ...,
encoding: str | None = ...,
compression: CompressionOptions = ...,
quoting: int | None = ...,
quotechar: str = ...,
lineterminator: str | None = ...,
chunksize: int | None = ...,
date_format: str | None = ...,
doublequote: bool_t = ...,
escapechar: str | None = ...,
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
) -> str:
...
@overload
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
sep: str = ...,
na_rep: str = ...,
float_format: str | Callable | None = ...,
columns: Sequence[Hashable] | None = ...,
header: bool_t | list[str] = ...,
index: bool_t = ...,
index_label: IndexLabel | None = ...,
mode: str = ...,
encoding: str | None = ...,
compression: CompressionOptions = ...,
quoting: int | None = ...,
quotechar: str = ...,
lineterminator: str | None = ...,
chunksize: int | None = ...,
date_format: str | None = ...,
doublequote: bool_t = ...,
escapechar: str | None = ...,
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
) -> None:
...
@final
@deprecate_nonkeyword_arguments(
version="3.0", allowed_args=["self", "path_or_buf"], name="to_csv"
)
@doc(
storage_options=_shared_docs["storage_options"],
compression_options=_shared_docs["compression_options"] % "path_or_buf",
)
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
sep: str = ",",
na_rep: str = "",
float_format: str | Callable | None = None,
columns: Sequence[Hashable] | None = None,
header: bool_t | list[str] = True,
index: bool_t = True,
index_label: IndexLabel | None = None,
mode: str = "w",
encoding: str | None = None,
compression: CompressionOptions = "infer",
quoting: int | None = None,
quotechar: str = '"',
lineterminator: str | None = None,
chunksize: int | None = None,
date_format: str | None = None,
doublequote: bool_t = True,
escapechar: str | None = None,
decimal: str = ".",
errors: OpenFileErrors = "strict",
storage_options: StorageOptions | None = None,
) -> str | None:
r"""
Write object to a comma-separated values (csv) file.
Parameters
----------
path_or_buf : str, path object, file-like object, or None, default None
String, path object (implementing os.PathLike[str]), or file-like
object implementing a write() function. If None, the result is
returned as a string. If a non-binary file object is passed, it should
be opened with `newline=''`, disabling universal newlines. If a binary
file object is passed, `mode` might need to contain a `'b'`.
sep : str, default ','
String of length 1. Field delimiter for the output file.
na_rep : str, default ''
Missing data representation.
float_format : str, Callable, default None
Format string for floating point numbers. If a Callable is given, it takes
precedence over other numeric formatting parameters, like decimal.
columns : sequence, optional
Columns to write.
header : bool or list of str, default True
Write out the column names. If a list of strings is given it is
assumed to be aliases for the column names.
index : bool, default True
Write row names (index).
index_label : str or sequence, or False, default None
Column label for index column(s) if desired. If None is given, and
`header` and `index` are True, then the index names are used. A
sequence should be given if the object uses MultiIndex. If
False do not print fields for index names. Use index_label=False
for easier importing in R.
mode : {{'w', 'x', 'a'}}, default 'w'
Forwarded to either `open(mode=)` or `fsspec.open(mode=)` to control
the file opening. Typical values include:
- 'w', truncate the file first.
- 'x', exclusive creation, failing if the file already exists.
- 'a', append to the end of file if it exists.
encoding : str, optional
A string representing the encoding to use in the output file,
defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
is a non-binary file object.
{compression_options}
May be a dict with key 'method' as compression mode
and other entries as additional compression options if
compression mode is 'zip'.
Passing compression options as keys in dict is
supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.
quoting : optional constant from csv module
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
will treat them as non-numeric.
quotechar : str, default '\"'
String of length 1. Character used to quote fields.
lineterminator : str, optional
The newline character or character sequence to use in the output
file. Defaults to `os.linesep`, which depends on the OS in which
this method is called ('\\n' for linux, '\\r\\n' for Windows, i.e.).
.. versionchanged:: 1.5.0
Previously was line_terminator, changed for consistency with
read_csv and the standard library 'csv' module.
chunksize : int or None
Rows to write at a time.
date_format : str, default None
Format string for datetime objects.
doublequote : bool, default True
Control quoting of `quotechar` inside a field.
escapechar : str, default None
String of length 1. Character used to escape `sep` and `quotechar`
when appropriate.
decimal : str, default '.'
Character recognized as decimal separator. E.g. use ',' for
European data.
errors : str, default 'strict'
Specifies how encoding and decoding errors are to be handled.
See the errors argument for :func:`open` for a full list
of options.
{storage_options}
Returns
-------
None or str
If path_or_buf is None, returns the resulting csv format as a
string. Otherwise returns None.
See Also
--------
read_csv : Load a CSV file into a DataFrame.
to_excel : Write DataFrame to an Excel file.
Examples
--------
Create 'out.csv' containing 'df' without indices
>>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'],
... 'mask': ['red', 'purple'],
... 'weapon': ['sai', 'bo staff']}})
>>> df.to_csv('out.csv', index=False) # doctest: +SKIP
Create 'out.zip' containing 'out.csv'
>>> df.to_csv(index=False)
'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
>>> compression_opts = dict(method='zip',
... archive_name='out.csv') # doctest: +SKIP
>>> df.to_csv('out.zip', index=False,
... compression=compression_opts) # doctest: +SKIP
To write a csv file to a new folder or nested folder you will first
need to create it using either Pathlib or os:
>>> from pathlib import Path # doctest: +SKIP
>>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP
>>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP
>>> df.to_csv(filepath) # doctest: +SKIP
>>> import os # doctest: +SKIP
>>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP
>>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP
"""
df = self if isinstance(self, ABCDataFrame) else self.to_frame()
formatter = DataFrameFormatter(
frame=df,
header=header,
index=index,
na_rep=na_rep,
float_format=float_format,
decimal=decimal,
)
return DataFrameRenderer(formatter).to_csv(
path_or_buf,
lineterminator=lineterminator,
sep=sep,
encoding=encoding,
errors=errors,
compression=compression,
quoting=quoting,
columns=columns,
index_label=index_label,
mode=mode,
chunksize=chunksize,
quotechar=quotechar,
date_format=date_format,
doublequote=doublequote,
escapechar=escapechar,
storage_options=storage_options,
)
# ----------------------------------------------------------------------
# Lookup Caching
def _reset_cacher(self) -> None:
"""
Reset the cacher.
"""
raise AbstractMethodError(self)
def _maybe_update_cacher(
self,
clear: bool_t = False,
verify_is_copy: bool_t = True,
inplace: bool_t = False,
) -> None:
"""
See if we need to update our parent cacher if clear, then clear our
cache.
Parameters
----------
clear : bool, default False
Clear the item cache.
verify_is_copy : bool, default True
Provide is_copy checks.
"""
if using_copy_on_write():
return
if verify_is_copy:
self._check_setitem_copy(t="referent")
if clear:
self._clear_item_cache()
def _clear_item_cache(self) -> None:
raise AbstractMethodError(self)
# ----------------------------------------------------------------------
# Indexing Methods
@final
def take(self, indices, axis: Axis = 0, **kwargs) -> Self:
"""
Return the elements in the given *positional* indices along an axis.
This means that we are not indexing according to actual values in
the index attribute of the object. We are indexing according to the
actual position of the element in the object.
Parameters
----------
indices : array-like
An array of ints indicating which positions to take.
axis : {0 or 'index', 1 or 'columns', None}, default 0
The axis on which to select elements. ``0`` means that we are
selecting rows, ``1`` means that we are selecting columns.
For `Series` this parameter is unused and defaults to 0.
**kwargs
For compatibility with :meth:`numpy.take`. Has no effect on the
output.
Returns
-------
same type as caller
An array-like containing the elements taken from the object.
See Also
--------
DataFrame.loc : Select a subset of a DataFrame by labels.
DataFrame.iloc : Select a subset of a DataFrame by positions.
numpy.take : Take elements from an array along an axis.
Examples
--------
>>> df = pd.DataFrame([('falcon', 'bird', 389.0),
... ('parrot', 'bird', 24.0),
... ('lion', 'mammal', 80.5),
... ('monkey', 'mammal', np.nan)],
... columns=['name', 'class', 'max_speed'],
... index=[0, 2, 3, 1])
>>> df
name class max_speed
0 falcon bird 389.0
2 parrot bird 24.0
3 lion mammal 80.5
1 monkey mammal NaN
Take elements at positions 0 and 3 along the axis 0 (default).
Note how the actual indices selected (0 and 1) do not correspond to
our selected indices 0 and 3. That's because we are selecting the 0th
and 3rd rows, not rows whose indices equal 0 and 3.
>>> df.take([0, 3])
name class max_speed
0 falcon bird 389.0
1 monkey mammal NaN
Take elements at indices 1 and 2 along the axis 1 (column selection).
>>> df.take([1, 2], axis=1)
class max_speed
0 bird 389.0
2 bird 24.0
3 mammal 80.5
1 mammal NaN
We may take elements using negative integers for positive indices,
starting from the end of the object, just like with Python lists.
>>> df.take([-1, -2])
name class max_speed
1 monkey mammal NaN
3 lion mammal 80.5
"""
nv.validate_take((), kwargs)
if not isinstance(indices, slice):
indices = np.asarray(indices, dtype=np.intp)
if (
axis == 0
and indices.ndim == 1
and using_copy_on_write()
and is_range_indexer(indices, len(self))
):
return self.copy(deep=None)
elif self.ndim == 1:
raise TypeError(
f"{type(self).__name__}.take requires a sequence of integers, "
"not slice."
)
else:
warnings.warn(
# GH#51539
f"Passing a slice to {type(self).__name__}.take is deprecated "
"and will raise in a future version. Use `obj[slicer]` or pass "
"a sequence of integers instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
# We can get here with a slice via DataFrame.__getitem__
indices = np.arange(
indices.start, indices.stop, indices.step, dtype=np.intp
)
new_data = self._mgr.take(
indices,
axis=self._get_block_manager_axis(axis),
verify=True,
)
return self._constructor_from_mgr(new_data, axes=new_data.axes).__finalize__(
self, method="take"
)
@final
def _take_with_is_copy(self, indices, axis: Axis = 0) -> Self:
"""
Internal version of the `take` method that sets the `_is_copy`
attribute to keep track of the parent dataframe (using in indexing
for the SettingWithCopyWarning).
For Series this does the same as the public take (it never sets `_is_copy`).
See the docstring of `take` for full explanation of the parameters.
"""
result = self.take(indices=indices, axis=axis)
# Maybe set copy if we didn't actually change the index.
if self.ndim == 2 and not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
return result
@final
def xs(
self,
key: IndexLabel,
axis: Axis = 0,
level: IndexLabel | None = None,
drop_level: bool_t = True,
) -> Self:
"""
Return cross-section from the Series/DataFrame.
This method takes a `key` argument to select data at a particular
level of a MultiIndex.
Parameters
----------
key : label or tuple of label
Label contained in the index, or partially in a MultiIndex.
axis : {0 or 'index', 1 or 'columns'}, default 0
Axis to retrieve cross-section on.
level : object, defaults to first n levels (n=1 or len(key))
In case of a key partially contained in a MultiIndex, indicate
which levels are used. Levels can be referred by label or position.
drop_level : bool, default True
If False, returns object with same levels as self.
Returns
-------
Series or DataFrame
Cross-section from the original Series or DataFrame
corresponding to the selected index levels.
See Also
--------
DataFrame.loc : Access a group of rows and columns
by label(s) or a boolean array.
DataFrame.iloc : Purely integer-location based indexing
for selection by position.
Notes
-----
`xs` can not be used to set values.
MultiIndex Slicers is a generic way to get/set values on
any level or levels.
It is a superset of `xs` functionality, see
:ref:`MultiIndex Slicers <advanced.mi_slicers>`.
Examples
--------
>>> d = {'num_legs': [4, 4, 2, 2],
... 'num_wings': [0, 0, 2, 2],
... 'class': ['mammal', 'mammal', 'mammal', 'bird'],
... 'animal': ['cat', 'dog', 'bat', 'penguin'],
... 'locomotion': ['walks', 'walks', 'flies', 'walks']}
>>> df = pd.DataFrame(data=d)
>>> df = df.set_index(['class', 'animal', 'locomotion'])
>>> df
num_legs num_wings
class animal locomotion
mammal cat walks 4 0
dog walks 4 0
bat flies 2 2
bird penguin walks 2 2
Get values at specified index
>>> df.xs('mammal')
num_legs num_wings
animal locomotion
cat walks 4 0
dog walks 4 0
bat flies 2 2
Get values at several indexes
>>> df.xs(('mammal', 'dog', 'walks'))
num_legs 4
num_wings 0
Name: (mammal, dog, walks), dtype: int64
Get values at specified index and level
>>> df.xs('cat', level=1)
num_legs num_wings
class locomotion
mammal walks 4 0
Get values at several indexes and levels
>>> df.xs(('bird', 'walks'),
... level=[0, 'locomotion'])
num_legs num_wings
animal
penguin 2 2
Get values at specified column and axis
>>> df.xs('num_wings', axis=1)
class animal locomotion
mammal cat walks 0
dog walks 0
bat flies 2
bird penguin walks 2
Name: num_wings, dtype: int64
"""
axis = self._get_axis_number(axis)
labels = self._get_axis(axis)
if isinstance(key, list):
raise TypeError("list keys are not supported in xs, pass a tuple instead")
if level is not None:
if not isinstance(labels, MultiIndex):
raise TypeError("Index must be a MultiIndex")
loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)
# create the tuple of the indexer
_indexer = [slice(None)] * self.ndim
_indexer[axis] = loc
indexer = tuple(_indexer)
result = self.iloc[indexer]
setattr(result, result._get_axis_name(axis), new_ax)
return result
if axis == 1:
if drop_level:
return self[key]
index = self.columns
else:
index = self.index
if isinstance(index, MultiIndex):
loc, new_index = index._get_loc_level(key, level=0)
if not drop_level:
if lib.is_integer(loc):
# Slice index must be an integer or None
new_index = index[loc : loc + 1]
else:
new_index = index[loc]
else:
loc = index.get_loc(key)
if isinstance(loc, np.ndarray):
if loc.dtype == np.bool_:
(inds,) = loc.nonzero()
return self._take_with_is_copy(inds, axis=axis)
else:
return self._take_with_is_copy(loc, axis=axis)
if not is_scalar(loc):
new_index = index[loc]
if is_scalar(loc) and axis == 0:
# In this case loc should be an integer
if self.ndim == 1:
# if we encounter an array-like and we only have 1 dim
# that means that their are list/ndarrays inside the Series!
# so just return them (GH 6394)
return self._values[loc]
new_mgr = self._mgr.fast_xs(loc)
result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes)
result._name = self.index[loc]
result = result.__finalize__(self)
elif is_scalar(loc):
result = self.iloc[:, slice(loc, loc + 1)]
elif axis == 1:
result = self.iloc[:, loc]
else:
result = self.iloc[loc]
result.index = new_index
# this could be a view
# but only in a single-dtyped view sliceable case
result._set_is_copy(self, copy=not result._is_view)
return result
def __getitem__(self, item):
raise AbstractMethodError(self)
@final
def _getitem_slice(self, key: slice) -> Self:
"""
__getitem__ for the case where the key is a slice object.
"""
# _convert_slice_indexer to determine if this slice is positional
# or label based, and if the latter, convert to positional
slobj = self.index._convert_slice_indexer(key, kind="getitem")
if isinstance(slobj, np.ndarray):
# reachable with DatetimeIndex
indexer = lib.maybe_indices_to_slice(
slobj.astype(np.intp, copy=False), len(self)
)
if isinstance(indexer, np.ndarray):
# GH#43223 If we can not convert, use take
return self.take(indexer, axis=0)
slobj = indexer
return self._slice(slobj)
def _slice(self, slobj: slice, axis: AxisInt = 0) -> Self:
"""
Construct a slice of this container.
Slicing with this method is *always* positional.
"""
assert isinstance(slobj, slice), type(slobj)
axis = self._get_block_manager_axis(axis)
new_mgr = self._mgr.get_slice(slobj, axis=axis)
result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
result = result.__finalize__(self)
# this could be a view
# but only in a single-dtyped view sliceable case
is_copy = axis != 0 or result._is_view
result._set_is_copy(self, copy=is_copy)
return result
@final
def _set_is_copy(self, ref: NDFrame, copy: bool_t = True) -> None:
if not copy:
self._is_copy = None
else:
assert ref is not None
self._is_copy = weakref.ref(ref)
def _check_is_chained_assignment_possible(self) -> bool_t:
"""
Check if we are a view, have a cacher, and are of mixed type.
If so, then force a setitem_copy check.
Should be called just near setting a value
Will return a boolean if it we are a view and are cached, but a
single-dtype meaning that the cacher should be updated following
setting.
"""
if self._is_copy:
self._check_setitem_copy(t="referent")
return False
@final
def _check_setitem_copy(self, t: str = "setting", force: bool_t = False):
"""
Parameters
----------
t : str, the type of setting error
force : bool, default False
If True, then force showing an error.
validate if we are doing a setitem on a chained copy.
It is technically possible to figure out that we are setting on
a copy even WITH a multi-dtyped pandas object. In other words, some
blocks may be views while other are not. Currently _is_view will ALWAYS
return False for multi-blocks to avoid having to handle this case.
df = DataFrame(np.arange(0,9), columns=['count'])
df['group'] = 'b'
# This technically need not raise SettingWithCopy if both are view
# (which is not generally guaranteed but is usually True. However,
# this is in general not a good practice and we recommend using .loc.
df.iloc[0:5]['group'] = 'a'
"""
if using_copy_on_write() or warn_copy_on_write():
return
# return early if the check is not needed
if not (force or self._is_copy):
return
value = config.get_option("mode.chained_assignment")
if value is None:
return
# see if the copy is not actually referred; if so, then dissolve
# the copy weakref
if self._is_copy is not None and not isinstance(self._is_copy, str):
r = self._is_copy()
if not gc.get_referents(r) or (r is not None and r.shape == self.shape):
self._is_copy = None
return
# a custom message
if isinstance(self._is_copy, str):
t = self._is_copy
elif t == "referent":
t = (
"\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame\n\n"
"See the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy"
)
else:
t = (
"\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame.\n"
"Try using .loc[row_indexer,col_indexer] = value "
"instead\n\nSee the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy"
)
if value == "raise":
raise SettingWithCopyError(t)
if value == "warn":
> warnings.warn(t, SettingWithCopyWarning, stacklevel=find_stack_level())
E pandas.errors.SettingWithCopyWarning:
E A value is trying to be set on a copy of a slice from a DataFrame.
E Try using .loc[row_indexer,col_indexer] = value instead
E
E See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\generic.py:4472: SettingWithCopyWarning
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
10 out of 12 runs failed: test_merge[True-right] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 9s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 4s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 6s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 6s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 7s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 7s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 6s]
Raw output
pandas.errors.SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:56614', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'right', disk = True
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
await list_eq(joined, pd.merge(A, B, on="y", how=how))
assert all(d is None for d in joined.divisions)
await list_eq(
dd.merge(a, b, left_on="x", right_on="z", how=how),
pd.merge(A, B, left_on="x", right_on="z", how=how),
)
await list_eq(
dd.merge(a, b, left_on="x", right_on="z", how=how, suffixes=("1", "2")),
pd.merge(A, B, left_on="x", right_on="z", how=how, suffixes=("1", "2")),
)
await list_eq(dd.merge(a, b, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(a, B, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(A, b, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(A, B, how=how), pd.merge(A, B, how=how))
await list_eq(
dd.merge(a, b, left_index=True, right_index=True, how=how),
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
await list_eq(
dd.merge(
a, b, left_index=True, right_index=True, how=how, suffixes=("1", "2")
),
pd.merge(
A, B, left_index=True, right_index=True, how=how, suffixes=("1", "2")
),
)
> await list_eq(
dd.merge(a, b, left_on="x", right_index=True, how=how),
pd.merge(A, B, left_on="x", right_index=True, how=how),
)
distributed\shuffle\tests\test_merge.py:218:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:336: in _result
raise exc.with_traceback(tb)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\dask_expr\_merge.py:775: in assign_index_merge_transfer
index["_index"] = df.index
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4299: in __setitem__
self._set_item(key, value)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4526: in _set_item
self._set_item_mgr(key, value, refs)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4484: in _set_item_mgr
self._check_setitem_copy()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
# pyright: reportPropertyTypeMismatch=false
from __future__ import annotations
import collections
from copy import deepcopy
import datetime as dt
from functools import partial
import gc
from json import loads
import operator
import pickle
import re
import sys
from typing import (
TYPE_CHECKING,
Any,
Callable,
ClassVar,
Literal,
NoReturn,
cast,
final,
overload,
)
import warnings
import weakref
import numpy as np
from pandas._config import (
config,
using_copy_on_write,
warn_copy_on_write,
)
from pandas._libs import lib
from pandas._libs.lib import is_range_indexer
from pandas._libs.tslibs import (
Period,
Tick,
Timestamp,
to_offset,
)
from pandas._libs.tslibs.dtypes import freq_to_period_freqstr
from pandas._typing import (
AlignJoin,
AnyArrayLike,
ArrayLike,
Axes,
Axis,
AxisInt,
CompressionOptions,
DtypeArg,
DtypeBackend,
DtypeObj,
FilePath,
FillnaOptions,
FloatFormatType,
FormattersType,
Frequency,
IgnoreRaise,
IndexKeyFunc,
IndexLabel,
InterpolateOptions,
IntervalClosedType,
JSONSerializable,
Level,
Manager,
NaPosition,
NDFrameT,
OpenFileErrors,
RandomState,
ReindexMethod,
Renamer,
Scalar,
Self,
SequenceNotStr,
SortKind,
StorageOptions,
Suffixes,
T,
TimeAmbiguous,
TimedeltaConvertibleTypes,
TimeNonexistent,
TimestampConvertibleTypes,
TimeUnit,
ValueKeyFunc,
WriteBuffer,
WriteExcelBuffer,
npt,
)
from pandas.compat import PYPY
from pandas.compat._constants import REF_COUNT
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.errors import (
AbstractMethodError,
ChainedAssignmentError,
InvalidIndexError,
SettingWithCopyError,
SettingWithCopyWarning,
_chained_assignment_method_msg,
_chained_assignment_warning_method_msg,
_check_cacher,
)
from pandas.util._decorators import (
deprecate_nonkeyword_arguments,
doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
check_dtype_backend,
validate_ascending,
validate_bool_kwarg,
validate_fillna_kwargs,
validate_inclusive,
)
from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.common import (
ensure_object,
ensure_platform_int,
ensure_str,
is_bool,
is_bool_dtype,
is_dict_like,
is_extension_array_dtype,
is_list_like,
is_number,
is_numeric_dtype,
is_re_compilable,
is_scalar,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
ExtensionDtype,
)
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCSeries,
)
from pandas.core.dtypes.inference import (
is_hashable,
is_nested_list_like,
)
from pandas.core.dtypes.missing import (
isna,
notna,
)
from pandas.core import (
algorithms as algos,
arraylike,
common,
indexing,
missing,
nanops,
sample,
)
from pandas.core.array_algos.replace import should_use_regex
from pandas.core.arrays import ExtensionArray
from pandas.core.base import PandasObject
from pandas.core.construction import extract_array
from pandas.core.flags import Flags
from pandas.core.indexes.api import (
DatetimeIndex,
Index,
MultiIndex,
PeriodIndex,
RangeIndex,
default_index,
ensure_index,
)
from pandas.core.internals import (
ArrayManager,
BlockManager,
SingleArrayManager,
)
from pandas.core.internals.construction import (
mgr_to_mgr,
ndarray_to_mgr,
)
from pandas.core.methods.describe import describe_ndframe
from pandas.core.missing import (
clean_fill_method,
clean_reindex_fill_method,
find_valid_index,
)
from pandas.core.reshape.concat import concat
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import get_indexer_indexer
from pandas.core.window import (
Expanding,
ExponentialMovingWindow,
Rolling,
Window,
)
from pandas.io.formats.format import (
DataFrameFormatter,
DataFrameRenderer,
)
from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
from collections.abc import (
Hashable,
Iterator,
Mapping,
Sequence,
)
from pandas._libs.tslibs import BaseOffset
from pandas import (
DataFrame,
ExcelWriter,
HDFStore,
Series,
)
from pandas.core.indexers.objects import BaseIndexer
from pandas.core.resample import Resampler
# goal is to be able to define the docs close to function, while still being
# able to share
_shared_docs = {**_shared_docs}
_shared_doc_kwargs = {
"axes": "keywords for axes",
"klass": "Series/DataFrame",
"axes_single_arg": "{0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame", # noqa: E501
"inplace": """
inplace : bool, default False
If True, performs operation inplace and returns None.""",
"optional_by": """
by : str or list of str
Name or list of names to sort by""",
}
bool_t = bool # Need alias because NDFrame has def bool:
class NDFrame(PandasObject, indexing.IndexingMixin):
"""
N-dimensional analogue of DataFrame. Store multi-dimensional in a
size-mutable, labeled data structure
Parameters
----------
data : BlockManager
axes : list
copy : bool, default False
"""
_internal_names: list[str] = [
"_mgr",
"_cacher",
"_item_cache",
"_cache",
"_is_copy",
"_name",
"_metadata",
"_flags",
]
_internal_names_set: set[str] = set(_internal_names)
_accessors: set[str] = set()
_hidden_attrs: frozenset[str] = frozenset([])
_metadata: list[str] = []
_is_copy: weakref.ReferenceType[NDFrame] | str | None = None
_mgr: Manager
_attrs: dict[Hashable, Any]
_typ: str
# ----------------------------------------------------------------------
# Constructors
def __init__(self, data: Manager) -> None:
object.__setattr__(self, "_is_copy", None)
object.__setattr__(self, "_mgr", data)
object.__setattr__(self, "_item_cache", {})
object.__setattr__(self, "_attrs", {})
object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))
@final
@classmethod
def _init_mgr(
cls,
mgr: Manager,
axes: dict[Literal["index", "columns"], Axes | None],
dtype: DtypeObj | None = None,
copy: bool_t = False,
) -> Manager:
"""passed a manager and a axes dict"""
for a, axe in axes.items():
if axe is not None:
axe = ensure_index(axe)
bm_axis = cls._get_block_manager_axis(a)
mgr = mgr.reindex_axis(axe, axis=bm_axis)
# make a copy if explicitly requested
if copy:
mgr = mgr.copy()
if dtype is not None:
# avoid further copies if we can
if (
isinstance(mgr, BlockManager)
and len(mgr.blocks) == 1
and mgr.blocks[0].values.dtype == dtype
):
pass
else:
mgr = mgr.astype(dtype=dtype)
return mgr
@final
def _as_manager(self, typ: str, copy: bool_t = True) -> Self:
"""
Private helper function to create a DataFrame with specific manager.
Parameters
----------
typ : {"block", "array"}
copy : bool, default True
Only controls whether the conversion from Block->ArrayManager
copies the 1D arrays (to ensure proper/contiguous memory layout).
Returns
-------
DataFrame
New DataFrame using specified manager type. Is not guaranteed
to be a copy or not.
"""
new_mgr: Manager
new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
# fastpath of passing a manager doesn't check the option/manager class
return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self)
@classmethod
def _from_mgr(cls, mgr: Manager, axes: list[Index]) -> Self:
"""
Construct a new object of this type from a Manager object and axes.
Parameters
----------
mgr : Manager
Must have the same ndim as cls.
axes : list[Index]
Notes
-----
The axes must match mgr.axes, but are required for future-proofing
in the event that axes are refactored out of the Manager objects.
"""
obj = cls.__new__(cls)
NDFrame.__init__(obj, mgr)
return obj
# ----------------------------------------------------------------------
# attrs and flags
@property
def attrs(self) -> dict[Hashable, Any]:
"""
Dictionary of global attributes of this dataset.
.. warning::
attrs is experimental and may change without warning.
See Also
--------
DataFrame.flags : Global flags applying to this object.
Notes
-----
Many operations that create new datasets will copy ``attrs``. Copies
are always deep so that changing ``attrs`` will only affect the
present dataset. ``pandas.concat`` copies ``attrs`` only if all input
datasets have the same ``attrs``.
Examples
--------
For Series:
>>> ser = pd.Series([1, 2, 3])
>>> ser.attrs = {"A": [10, 20, 30]}
>>> ser.attrs
{'A': [10, 20, 30]}
For DataFrame:
>>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
>>> df.attrs = {"A": [10, 20, 30]}
>>> df.attrs
{'A': [10, 20, 30]}
"""
return self._attrs
@attrs.setter
def attrs(self, value: Mapping[Hashable, Any]) -> None:
self._attrs = dict(value)
@final
@property
def flags(self) -> Flags:
"""
Get the properties associated with this pandas object.
The available flags are
* :attr:`Flags.allows_duplicate_labels`
See Also
--------
Flags : Flags that apply to pandas objects.
DataFrame.attrs : Global metadata applying to this dataset.
Notes
-----
"Flags" differ from "metadata". Flags reflect properties of the
pandas object (the Series or DataFrame). Metadata refer to properties
of the dataset, and should be stored in :attr:`DataFrame.attrs`.
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2]})
>>> df.flags
<Flags(allows_duplicate_labels=True)>
Flags can be get or set using ``.``
>>> df.flags.allows_duplicate_labels
True
>>> df.flags.allows_duplicate_labels = False
Or by slicing with a key
>>> df.flags["allows_duplicate_labels"]
False
>>> df.flags["allows_duplicate_labels"] = True
"""
return self._flags
@final
def set_flags(
self,
*,
copy: bool_t = False,
allows_duplicate_labels: bool_t | None = None,
) -> Self:
"""
Return a new object with updated flags.
Parameters
----------
copy : bool, default False
Specify if a copy of the object should be made.
.. note::
The `copy` keyword will change behavior in pandas 3.0.
`Copy-on-Write
<https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
will be enabled by default, which means that all methods with a
`copy` keyword will use a lazy copy mechanism to defer the copy and
ignore the `copy` keyword. The `copy` keyword will be removed in a
future version of pandas.
You can already get the future behavior and improvements through
enabling copy on write ``pd.options.mode.copy_on_write = True``
allows_duplicate_labels : bool, optional
Whether the returned object allows duplicate labels.
Returns
-------
Series or DataFrame
The same type as the caller.
See Also
--------
DataFrame.attrs : Global metadata applying to this dataset.
DataFrame.flags : Global flags applying to this object.
Notes
-----
This method returns a new object that's a view on the same data
as the input. Mutating the input or the output values will be reflected
in the other.
This method is intended to be used in method chains.
"Flags" differ from "metadata". Flags reflect properties of the
pandas object (the Series or DataFrame). Metadata refer to properties
of the dataset, and should be stored in :attr:`DataFrame.attrs`.
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2]})
>>> df.flags.allows_duplicate_labels
True
>>> df2 = df.set_flags(allows_duplicate_labels=False)
>>> df2.flags.allows_duplicate_labels
False
"""
df = self.copy(deep=copy and not using_copy_on_write())
if allows_duplicate_labels is not None:
df.flags["allows_duplicate_labels"] = allows_duplicate_labels
return df
@final
@classmethod
def _validate_dtype(cls, dtype) -> DtypeObj | None:
"""validate the passed dtype"""
if dtype is not None:
dtype = pandas_dtype(dtype)
# a compound dtype
if dtype.kind == "V":
raise NotImplementedError(
"compound dtypes are not implemented "
f"in the {cls.__name__} constructor"
)
return dtype
# ----------------------------------------------------------------------
# Construction
@property
def _constructor(self) -> Callable[..., Self]:
"""
Used when a manipulation result has the same dimensions as the
original.
"""
raise AbstractMethodError(self)
# ----------------------------------------------------------------------
# Internals
@final
@property
def _data(self):
# GH#33054 retained because some downstream packages uses this,
# e.g. fastparquet
# GH#33333
warnings.warn(
f"{type(self).__name__}._data is deprecated and will be removed in "
"a future version. Use public APIs instead.",
DeprecationWarning,
stacklevel=find_stack_level(),
)
return self._mgr
# ----------------------------------------------------------------------
# Axis
_AXIS_ORDERS: list[Literal["index", "columns"]]
_AXIS_TO_AXIS_NUMBER: dict[Axis, AxisInt] = {0: 0, "index": 0, "rows": 0}
_info_axis_number: int
_info_axis_name: Literal["index", "columns"]
_AXIS_LEN: int
@final
def _construct_axes_dict(self, axes: Sequence[Axis] | None = None, **kwargs):
"""Return an axes dictionary for myself."""
d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
# error: Argument 1 to "update" of "MutableMapping" has incompatible type
# "Dict[str, Any]"; expected "SupportsKeysAndGetItem[Union[int, str], Any]"
d.update(kwargs) # type: ignore[arg-type]
return d
@final
@classmethod
def _get_axis_number(cls, axis: Axis) -> AxisInt:
try:
return cls._AXIS_TO_AXIS_NUMBER[axis]
except KeyError:
raise ValueError(f"No axis named {axis} for object type {cls.__name__}")
@final
@classmethod
def _get_axis_name(cls, axis: Axis) -> Literal["index", "columns"]:
axis_number = cls._get_axis_number(axis)
return cls._AXIS_ORDERS[axis_number]
@final
def _get_axis(self, axis: Axis) -> Index:
axis_number = self._get_axis_number(axis)
assert axis_number in {0, 1}
return self.index if axis_number == 0 else self.columns
@final
@classmethod
def _get_block_manager_axis(cls, axis: Axis) -> AxisInt:
"""Map the axis to the block_manager axis."""
axis = cls._get_axis_number(axis)
ndim = cls._AXIS_LEN
if ndim == 2:
# i.e. DataFrame
return 1 - axis
return axis
@final
def _get_axis_resolvers(self, axis: str) -> dict[str, Series | MultiIndex]:
# index or columns
axis_index = getattr(self, axis)
d = {}
prefix = axis[0]
for i, name in enumerate(axis_index.names):
if name is not None:
key = level = name
else:
# prefix with 'i' or 'c' depending on the input axis
# e.g., you must do ilevel_0 for the 0th level of an unnamed
# multiiindex
key = f"{prefix}level_{i}"
level = i
level_values = axis_index.get_level_values(level)
s = level_values.to_series()
s.index = axis_index
d[key] = s
# put the index/columns itself in the dict
if isinstance(axis_index, MultiIndex):
dindex = axis_index
else:
dindex = axis_index.to_series()
d[axis] = dindex
return d
@final
def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]:
from pandas.core.computation.parsing import clean_column_name
d: dict[str, Series | MultiIndex] = {}
for axis_name in self._AXIS_ORDERS:
d.update(self._get_axis_resolvers(axis_name))
return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)}
@final
def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:
"""
Return the special character free column resolvers of a dataframe.
Column names with special characters are 'cleaned up' so that they can
be referred to by backtick quoting.
Used in :meth:`DataFrame.eval`.
"""
from pandas.core.computation.parsing import clean_column_name
from pandas.core.series import Series
if isinstance(self, ABCSeries):
return {clean_column_name(self.name): self}
return {
clean_column_name(k): Series(
v, copy=False, index=self.index, name=k, dtype=self.dtypes[k]
).__finalize__(self)
for k, v in zip(self.columns, self._iter_column_arrays())
if not isinstance(k, int)
}
@final
@property
def _info_axis(self) -> Index:
return getattr(self, self._info_axis_name)
def _is_view_after_cow_rules(self):
# Only to be used in cases of chained assignment checks, this is a
# simplified check that assumes that either the whole object is a view
# or a copy
if len(self._mgr.blocks) == 0: # type: ignore[union-attr]
return False
return self._mgr.blocks[0].refs.has_reference() # type: ignore[union-attr]
@property
def shape(self) -> tuple[int, ...]:
"""
Return a tuple of axis dimensions
"""
return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)
@property
def axes(self) -> list[Index]:
"""
Return index label(s) of the internal NDFrame
"""
# we do it this way because if we have reversed axes, then
# the block manager shows then reversed
return [self._get_axis(a) for a in self._AXIS_ORDERS]
@final
@property
def ndim(self) -> int:
"""
Return an int representing the number of axes / array dimensions.
Return 1 if Series. Otherwise return 2 if DataFrame.
See Also
--------
ndarray.ndim : Number of array dimensions.
Examples
--------
>>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
>>> s.ndim
1
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.ndim
2
"""
return self._mgr.ndim
@final
@property
def size(self) -> int:
"""
Return an int representing the number of elements in this object.
Return the number of rows if Series. Otherwise return the number of
rows times number of columns if DataFrame.
See Also
--------
ndarray.size : Number of elements in the array.
Examples
--------
>>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
>>> s.size
3
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.size
4
"""
return int(np.prod(self.shape))
def set_axis(
self,
labels,
*,
axis: Axis = 0,
copy: bool_t | None = None,
) -> Self:
"""
Assign desired index to given axis.
Indexes for%(extended_summary_sub)s row labels can be changed by assigning
a list-like or Index.
Parameters
----------
labels : list-like, Index
The values for the new index.
axis : %(axes_single_arg)s, default 0
The axis to update. The value 0 identifies the rows. For `Series`
this parameter is unused and defaults to 0.
copy : bool, default True
Whether to make a copy of the underlying data.
.. note::
The `copy` keyword will change behavior in pandas 3.0.
`Copy-on-Write
<https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
will be enabled by default, which means that all methods with a
`copy` keyword will use a lazy copy mechanism to defer the copy and
ignore the `copy` keyword. The `copy` keyword will be removed in a
future version of pandas.
You can already get the future behavior and improvements through
enabling copy on write ``pd.options.mode.copy_on_write = True``
Returns
-------
%(klass)s
An object of type %(klass)s.
See Also
--------
%(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s.
"""
return self._set_axis_nocheck(labels, axis, inplace=False, copy=copy)
@final
def _set_axis_nocheck(
self, labels, axis: Axis, inplace: bool_t, copy: bool_t | None
):
if inplace:
setattr(self, self._get_axis_name(axis), labels)
else:
# With copy=False, we create a new object but don't copy the
# underlying data.
obj = self.copy(deep=copy and not using_copy_on_write())
setattr(obj, obj._get_axis_name(axis), labels)
return obj
@final
def _set_axis(self, axis: AxisInt, labels: AnyArrayLike | list) -> None:
"""
This is called from the cython code when we set the `index` attribute
directly, e.g. `series.index = [1, 2, 3]`.
"""
labels = ensure_index(labels)
self._mgr.set_axis(axis, labels)
self._clear_item_cache()
@final
def swapaxes(self, axis1: Axis, axis2: Axis, copy: bool_t | None = None) -> Self:
"""
Interchange axes and swap values axes appropriately.
.. deprecated:: 2.1.0
``swapaxes`` is deprecated and will be removed.
Please use ``transpose`` instead.
Returns
-------
same as input
Examples
--------
Please see examples for :meth:`DataFrame.transpose`.
"""
warnings.warn(
# GH#51946
f"'{type(self).__name__}.swapaxes' is deprecated and "
"will be removed in a future version. "
f"Please use '{type(self).__name__}.transpose' instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
i = self._get_axis_number(axis1)
j = self._get_axis_number(axis2)
if i == … a list
will call the method numerous times.
format : dict, list of dict
Keyword args to pass to the method call of ``Styler.format``. If a list will
call the method numerous times.
format_index : dict, list of dict
Keyword args to pass to the method call of ``Styler.format_index``. If a
list will call the method numerous times.
render_kwargs : dict
Keyword args to pass to the method call of ``Styler.to_latex``.
Returns
-------
str or None
If buf is None, returns the result as a string. Otherwise returns None.
"""
from pandas.io.formats.style import Styler
self = cast("DataFrame", self)
styler = Styler(self, uuid="")
for kw_name in ["hide", "relabel_index", "format", "format_index"]:
kw = vars()[kw_name]
if isinstance(kw, dict):
getattr(styler, kw_name)(**kw)
elif isinstance(kw, list):
for sub_kw in kw:
getattr(styler, kw_name)(**sub_kw)
# bold_rows is not a direct kwarg of Styler.to_latex
render_kwargs = {} if render_kwargs is None else render_kwargs
if render_kwargs.pop("bold_rows"):
styler.map_index(lambda v: "textbf:--rwrap;")
return styler.to_latex(buf=buf, **render_kwargs)
@overload
def to_csv(
self,
path_or_buf: None = ...,
sep: str = ...,
na_rep: str = ...,
float_format: str | Callable | None = ...,
columns: Sequence[Hashable] | None = ...,
header: bool_t | list[str] = ...,
index: bool_t = ...,
index_label: IndexLabel | None = ...,
mode: str = ...,
encoding: str | None = ...,
compression: CompressionOptions = ...,
quoting: int | None = ...,
quotechar: str = ...,
lineterminator: str | None = ...,
chunksize: int | None = ...,
date_format: str | None = ...,
doublequote: bool_t = ...,
escapechar: str | None = ...,
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
) -> str:
...
@overload
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
sep: str = ...,
na_rep: str = ...,
float_format: str | Callable | None = ...,
columns: Sequence[Hashable] | None = ...,
header: bool_t | list[str] = ...,
index: bool_t = ...,
index_label: IndexLabel | None = ...,
mode: str = ...,
encoding: str | None = ...,
compression: CompressionOptions = ...,
quoting: int | None = ...,
quotechar: str = ...,
lineterminator: str | None = ...,
chunksize: int | None = ...,
date_format: str | None = ...,
doublequote: bool_t = ...,
escapechar: str | None = ...,
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
) -> None:
...
@final
@deprecate_nonkeyword_arguments(
version="3.0", allowed_args=["self", "path_or_buf"], name="to_csv"
)
@doc(
storage_options=_shared_docs["storage_options"],
compression_options=_shared_docs["compression_options"] % "path_or_buf",
)
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
sep: str = ",",
na_rep: str = "",
float_format: str | Callable | None = None,
columns: Sequence[Hashable] | None = None,
header: bool_t | list[str] = True,
index: bool_t = True,
index_label: IndexLabel | None = None,
mode: str = "w",
encoding: str | None = None,
compression: CompressionOptions = "infer",
quoting: int | None = None,
quotechar: str = '"',
lineterminator: str | None = None,
chunksize: int | None = None,
date_format: str | None = None,
doublequote: bool_t = True,
escapechar: str | None = None,
decimal: str = ".",
errors: OpenFileErrors = "strict",
storage_options: StorageOptions | None = None,
) -> str | None:
r"""
Write object to a comma-separated values (csv) file.
Parameters
----------
path_or_buf : str, path object, file-like object, or None, default None
String, path object (implementing os.PathLike[str]), or file-like
object implementing a write() function. If None, the result is
returned as a string. If a non-binary file object is passed, it should
be opened with `newline=''`, disabling universal newlines. If a binary
file object is passed, `mode` might need to contain a `'b'`.
sep : str, default ','
String of length 1. Field delimiter for the output file.
na_rep : str, default ''
Missing data representation.
float_format : str, Callable, default None
Format string for floating point numbers. If a Callable is given, it takes
precedence over other numeric formatting parameters, like decimal.
columns : sequence, optional
Columns to write.
header : bool or list of str, default True
Write out the column names. If a list of strings is given it is
assumed to be aliases for the column names.
index : bool, default True
Write row names (index).
index_label : str or sequence, or False, default None
Column label for index column(s) if desired. If None is given, and
`header` and `index` are True, then the index names are used. A
sequence should be given if the object uses MultiIndex. If
False do not print fields for index names. Use index_label=False
for easier importing in R.
mode : {{'w', 'x', 'a'}}, default 'w'
Forwarded to either `open(mode=)` or `fsspec.open(mode=)` to control
the file opening. Typical values include:
- 'w', truncate the file first.
- 'x', exclusive creation, failing if the file already exists.
- 'a', append to the end of file if it exists.
encoding : str, optional
A string representing the encoding to use in the output file,
defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
is a non-binary file object.
{compression_options}
May be a dict with key 'method' as compression mode
and other entries as additional compression options if
compression mode is 'zip'.
Passing compression options as keys in dict is
supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.
quoting : optional constant from csv module
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
will treat them as non-numeric.
quotechar : str, default '\"'
String of length 1. Character used to quote fields.
lineterminator : str, optional
The newline character or character sequence to use in the output
file. Defaults to `os.linesep`, which depends on the OS in which
this method is called ('\\n' for linux, '\\r\\n' for Windows, i.e.).
.. versionchanged:: 1.5.0
Previously was line_terminator, changed for consistency with
read_csv and the standard library 'csv' module.
chunksize : int or None
Rows to write at a time.
date_format : str, default None
Format string for datetime objects.
doublequote : bool, default True
Control quoting of `quotechar` inside a field.
escapechar : str, default None
String of length 1. Character used to escape `sep` and `quotechar`
when appropriate.
decimal : str, default '.'
Character recognized as decimal separator. E.g. use ',' for
European data.
errors : str, default 'strict'
Specifies how encoding and decoding errors are to be handled.
See the errors argument for :func:`open` for a full list
of options.
{storage_options}
Returns
-------
None or str
If path_or_buf is None, returns the resulting csv format as a
string. Otherwise returns None.
See Also
--------
read_csv : Load a CSV file into a DataFrame.
to_excel : Write DataFrame to an Excel file.
Examples
--------
Create 'out.csv' containing 'df' without indices
>>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'],
... 'mask': ['red', 'purple'],
... 'weapon': ['sai', 'bo staff']}})
>>> df.to_csv('out.csv', index=False) # doctest: +SKIP
Create 'out.zip' containing 'out.csv'
>>> df.to_csv(index=False)
'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
>>> compression_opts = dict(method='zip',
... archive_name='out.csv') # doctest: +SKIP
>>> df.to_csv('out.zip', index=False,
... compression=compression_opts) # doctest: +SKIP
To write a csv file to a new folder or nested folder you will first
need to create it using either Pathlib or os:
>>> from pathlib import Path # doctest: +SKIP
>>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP
>>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP
>>> df.to_csv(filepath) # doctest: +SKIP
>>> import os # doctest: +SKIP
>>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP
>>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP
"""
df = self if isinstance(self, ABCDataFrame) else self.to_frame()
formatter = DataFrameFormatter(
frame=df,
header=header,
index=index,
na_rep=na_rep,
float_format=float_format,
decimal=decimal,
)
return DataFrameRenderer(formatter).to_csv(
path_or_buf,
lineterminator=lineterminator,
sep=sep,
encoding=encoding,
errors=errors,
compression=compression,
quoting=quoting,
columns=columns,
index_label=index_label,
mode=mode,
chunksize=chunksize,
quotechar=quotechar,
date_format=date_format,
doublequote=doublequote,
escapechar=escapechar,
storage_options=storage_options,
)
# ----------------------------------------------------------------------
# Lookup Caching
def _reset_cacher(self) -> None:
"""
Reset the cacher.
"""
raise AbstractMethodError(self)
def _maybe_update_cacher(
self,
clear: bool_t = False,
verify_is_copy: bool_t = True,
inplace: bool_t = False,
) -> None:
"""
See if we need to update our parent cacher if clear, then clear our
cache.
Parameters
----------
clear : bool, default False
Clear the item cache.
verify_is_copy : bool, default True
Provide is_copy checks.
"""
if using_copy_on_write():
return
if verify_is_copy:
self._check_setitem_copy(t="referent")
if clear:
self._clear_item_cache()
def _clear_item_cache(self) -> None:
raise AbstractMethodError(self)
# ----------------------------------------------------------------------
# Indexing Methods
@final
def take(self, indices, axis: Axis = 0, **kwargs) -> Self:
"""
Return the elements in the given *positional* indices along an axis.
This means that we are not indexing according to actual values in
the index attribute of the object. We are indexing according to the
actual position of the element in the object.
Parameters
----------
indices : array-like
An array of ints indicating which positions to take.
axis : {0 or 'index', 1 or 'columns', None}, default 0
The axis on which to select elements. ``0`` means that we are
selecting rows, ``1`` means that we are selecting columns.
For `Series` this parameter is unused and defaults to 0.
**kwargs
For compatibility with :meth:`numpy.take`. Has no effect on the
output.
Returns
-------
same type as caller
An array-like containing the elements taken from the object.
See Also
--------
DataFrame.loc : Select a subset of a DataFrame by labels.
DataFrame.iloc : Select a subset of a DataFrame by positions.
numpy.take : Take elements from an array along an axis.
Examples
--------
>>> df = pd.DataFrame([('falcon', 'bird', 389.0),
... ('parrot', 'bird', 24.0),
... ('lion', 'mammal', 80.5),
... ('monkey', 'mammal', np.nan)],
... columns=['name', 'class', 'max_speed'],
... index=[0, 2, 3, 1])
>>> df
name class max_speed
0 falcon bird 389.0
2 parrot bird 24.0
3 lion mammal 80.5
1 monkey mammal NaN
Take elements at positions 0 and 3 along the axis 0 (default).
Note how the actual indices selected (0 and 1) do not correspond to
our selected indices 0 and 3. That's because we are selecting the 0th
and 3rd rows, not rows whose indices equal 0 and 3.
>>> df.take([0, 3])
name class max_speed
0 falcon bird 389.0
1 monkey mammal NaN
Take elements at indices 1 and 2 along the axis 1 (column selection).
>>> df.take([1, 2], axis=1)
class max_speed
0 bird 389.0
2 bird 24.0
3 mammal 80.5
1 mammal NaN
We may take elements using negative integers for positive indices,
starting from the end of the object, just like with Python lists.
>>> df.take([-1, -2])
name class max_speed
1 monkey mammal NaN
3 lion mammal 80.5
"""
nv.validate_take((), kwargs)
if not isinstance(indices, slice):
indices = np.asarray(indices, dtype=np.intp)
if (
axis == 0
and indices.ndim == 1
and using_copy_on_write()
and is_range_indexer(indices, len(self))
):
return self.copy(deep=None)
elif self.ndim == 1:
raise TypeError(
f"{type(self).__name__}.take requires a sequence of integers, "
"not slice."
)
else:
warnings.warn(
# GH#51539
f"Passing a slice to {type(self).__name__}.take is deprecated "
"and will raise in a future version. Use `obj[slicer]` or pass "
"a sequence of integers instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
# We can get here with a slice via DataFrame.__getitem__
indices = np.arange(
indices.start, indices.stop, indices.step, dtype=np.intp
)
new_data = self._mgr.take(
indices,
axis=self._get_block_manager_axis(axis),
verify=True,
)
return self._constructor_from_mgr(new_data, axes=new_data.axes).__finalize__(
self, method="take"
)
@final
def _take_with_is_copy(self, indices, axis: Axis = 0) -> Self:
"""
Internal version of the `take` method that sets the `_is_copy`
attribute to keep track of the parent dataframe (using in indexing
for the SettingWithCopyWarning).
For Series this does the same as the public take (it never sets `_is_copy`).
See the docstring of `take` for full explanation of the parameters.
"""
result = self.take(indices=indices, axis=axis)
# Maybe set copy if we didn't actually change the index.
if self.ndim == 2 and not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
return result
@final
def xs(
self,
key: IndexLabel,
axis: Axis = 0,
level: IndexLabel | None = None,
drop_level: bool_t = True,
) -> Self:
"""
Return cross-section from the Series/DataFrame.
This method takes a `key` argument to select data at a particular
level of a MultiIndex.
Parameters
----------
key : label or tuple of label
Label contained in the index, or partially in a MultiIndex.
axis : {0 or 'index', 1 or 'columns'}, default 0
Axis to retrieve cross-section on.
level : object, defaults to first n levels (n=1 or len(key))
In case of a key partially contained in a MultiIndex, indicate
which levels are used. Levels can be referred by label or position.
drop_level : bool, default True
If False, returns object with same levels as self.
Returns
-------
Series or DataFrame
Cross-section from the original Series or DataFrame
corresponding to the selected index levels.
See Also
--------
DataFrame.loc : Access a group of rows and columns
by label(s) or a boolean array.
DataFrame.iloc : Purely integer-location based indexing
for selection by position.
Notes
-----
`xs` can not be used to set values.
MultiIndex Slicers is a generic way to get/set values on
any level or levels.
It is a superset of `xs` functionality, see
:ref:`MultiIndex Slicers <advanced.mi_slicers>`.
Examples
--------
>>> d = {'num_legs': [4, 4, 2, 2],
... 'num_wings': [0, 0, 2, 2],
... 'class': ['mammal', 'mammal', 'mammal', 'bird'],
... 'animal': ['cat', 'dog', 'bat', 'penguin'],
... 'locomotion': ['walks', 'walks', 'flies', 'walks']}
>>> df = pd.DataFrame(data=d)
>>> df = df.set_index(['class', 'animal', 'locomotion'])
>>> df
num_legs num_wings
class animal locomotion
mammal cat walks 4 0
dog walks 4 0
bat flies 2 2
bird penguin walks 2 2
Get values at specified index
>>> df.xs('mammal')
num_legs num_wings
animal locomotion
cat walks 4 0
dog walks 4 0
bat flies 2 2
Get values at several indexes
>>> df.xs(('mammal', 'dog', 'walks'))
num_legs 4
num_wings 0
Name: (mammal, dog, walks), dtype: int64
Get values at specified index and level
>>> df.xs('cat', level=1)
num_legs num_wings
class locomotion
mammal walks 4 0
Get values at several indexes and levels
>>> df.xs(('bird', 'walks'),
... level=[0, 'locomotion'])
num_legs num_wings
animal
penguin 2 2
Get values at specified column and axis
>>> df.xs('num_wings', axis=1)
class animal locomotion
mammal cat walks 0
dog walks 0
bat flies 2
bird penguin walks 2
Name: num_wings, dtype: int64
"""
axis = self._get_axis_number(axis)
labels = self._get_axis(axis)
if isinstance(key, list):
raise TypeError("list keys are not supported in xs, pass a tuple instead")
if level is not None:
if not isinstance(labels, MultiIndex):
raise TypeError("Index must be a MultiIndex")
loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)
# create the tuple of the indexer
_indexer = [slice(None)] * self.ndim
_indexer[axis] = loc
indexer = tuple(_indexer)
result = self.iloc[indexer]
setattr(result, result._get_axis_name(axis), new_ax)
return result
if axis == 1:
if drop_level:
return self[key]
index = self.columns
else:
index = self.index
if isinstance(index, MultiIndex):
loc, new_index = index._get_loc_level(key, level=0)
if not drop_level:
if lib.is_integer(loc):
# Slice index must be an integer or None
new_index = index[loc : loc + 1]
else:
new_index = index[loc]
else:
loc = index.get_loc(key)
if isinstance(loc, np.ndarray):
if loc.dtype == np.bool_:
(inds,) = loc.nonzero()
return self._take_with_is_copy(inds, axis=axis)
else:
return self._take_with_is_copy(loc, axis=axis)
if not is_scalar(loc):
new_index = index[loc]
if is_scalar(loc) and axis == 0:
# In this case loc should be an integer
if self.ndim == 1:
# if we encounter an array-like and we only have 1 dim
# that means that their are list/ndarrays inside the Series!
# so just return them (GH 6394)
return self._values[loc]
new_mgr = self._mgr.fast_xs(loc)
result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes)
result._name = self.index[loc]
result = result.__finalize__(self)
elif is_scalar(loc):
result = self.iloc[:, slice(loc, loc + 1)]
elif axis == 1:
result = self.iloc[:, loc]
else:
result = self.iloc[loc]
result.index = new_index
# this could be a view
# but only in a single-dtyped view sliceable case
result._set_is_copy(self, copy=not result._is_view)
return result
def __getitem__(self, item):
raise AbstractMethodError(self)
@final
def _getitem_slice(self, key: slice) -> Self:
"""
__getitem__ for the case where the key is a slice object.
"""
# _convert_slice_indexer to determine if this slice is positional
# or label based, and if the latter, convert to positional
slobj = self.index._convert_slice_indexer(key, kind="getitem")
if isinstance(slobj, np.ndarray):
# reachable with DatetimeIndex
indexer = lib.maybe_indices_to_slice(
slobj.astype(np.intp, copy=False), len(self)
)
if isinstance(indexer, np.ndarray):
# GH#43223 If we can not convert, use take
return self.take(indexer, axis=0)
slobj = indexer
return self._slice(slobj)
def _slice(self, slobj: slice, axis: AxisInt = 0) -> Self:
"""
Construct a slice of this container.
Slicing with this method is *always* positional.
"""
assert isinstance(slobj, slice), type(slobj)
axis = self._get_block_manager_axis(axis)
new_mgr = self._mgr.get_slice(slobj, axis=axis)
result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
result = result.__finalize__(self)
# this could be a view
# but only in a single-dtyped view sliceable case
is_copy = axis != 0 or result._is_view
result._set_is_copy(self, copy=is_copy)
return result
@final
def _set_is_copy(self, ref: NDFrame, copy: bool_t = True) -> None:
if not copy:
self._is_copy = None
else:
assert ref is not None
self._is_copy = weakref.ref(ref)
def _check_is_chained_assignment_possible(self) -> bool_t:
"""
Check if we are a view, have a cacher, and are of mixed type.
If so, then force a setitem_copy check.
Should be called just near setting a value
Will return a boolean if it we are a view and are cached, but a
single-dtype meaning that the cacher should be updated following
setting.
"""
if self._is_copy:
self._check_setitem_copy(t="referent")
return False
@final
def _check_setitem_copy(self, t: str = "setting", force: bool_t = False):
"""
Parameters
----------
t : str, the type of setting error
force : bool, default False
If True, then force showing an error.
validate if we are doing a setitem on a chained copy.
It is technically possible to figure out that we are setting on
a copy even WITH a multi-dtyped pandas object. In other words, some
blocks may be views while other are not. Currently _is_view will ALWAYS
return False for multi-blocks to avoid having to handle this case.
df = DataFrame(np.arange(0,9), columns=['count'])
df['group'] = 'b'
# This technically need not raise SettingWithCopy if both are view
# (which is not generally guaranteed but is usually True. However,
# this is in general not a good practice and we recommend using .loc.
df.iloc[0:5]['group'] = 'a'
"""
if using_copy_on_write() or warn_copy_on_write():
return
# return early if the check is not needed
if not (force or self._is_copy):
return
value = config.get_option("mode.chained_assignment")
if value is None:
return
# see if the copy is not actually referred; if so, then dissolve
# the copy weakref
if self._is_copy is not None and not isinstance(self._is_copy, str):
r = self._is_copy()
if not gc.get_referents(r) or (r is not None and r.shape == self.shape):
self._is_copy = None
return
# a custom message
if isinstance(self._is_copy, str):
t = self._is_copy
elif t == "referent":
t = (
"\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame\n\n"
"See the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy"
)
else:
t = (
"\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame.\n"
"Try using .loc[row_indexer,col_indexer] = value "
"instead\n\nSee the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy"
)
if value == "raise":
raise SettingWithCopyError(t)
if value == "warn":
> warnings.warn(t, SettingWithCopyWarning, stacklevel=find_stack_level())
E pandas.errors.SettingWithCopyWarning:
E A value is trying to be set on a copy of a slice from a DataFrame.
E Try using .loc[row_indexer,col_indexer] = value instead
E
E See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\generic.py:4472: SettingWithCopyWarning
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
10 out of 12 runs failed: test_merge[False-inner] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 8s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 4s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 4s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 5s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 6s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 7s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 8s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 6s]
Raw output
pandas.errors.SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:56660', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'inner', disk = False
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
await list_eq(joined, pd.merge(A, B, on="y", how=how))
assert all(d is None for d in joined.divisions)
await list_eq(
dd.merge(a, b, left_on="x", right_on="z", how=how),
pd.merge(A, B, left_on="x", right_on="z", how=how),
)
await list_eq(
dd.merge(a, b, left_on="x", right_on="z", how=how, suffixes=("1", "2")),
pd.merge(A, B, left_on="x", right_on="z", how=how, suffixes=("1", "2")),
)
await list_eq(dd.merge(a, b, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(a, B, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(A, b, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(A, B, how=how), pd.merge(A, B, how=how))
await list_eq(
dd.merge(a, b, left_index=True, right_index=True, how=how),
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
await list_eq(
dd.merge(
a, b, left_index=True, right_index=True, how=how, suffixes=("1", "2")
),
pd.merge(
A, B, left_index=True, right_index=True, how=how, suffixes=("1", "2")
),
)
> await list_eq(
dd.merge(a, b, left_on="x", right_index=True, how=how),
pd.merge(A, B, left_on="x", right_index=True, how=how),
)
distributed\shuffle\tests\test_merge.py:218:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:336: in _result
raise exc.with_traceback(tb)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\dask_expr\_merge.py:775: in assign_index_merge_transfer
index["_index"] = df.index
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4299: in __setitem__
self._set_item(key, value)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4526: in _set_item
self._set_item_mgr(key, value, refs)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4484: in _set_item_mgr
self._check_setitem_copy()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
# pyright: reportPropertyTypeMismatch=false
from __future__ import annotations
import collections
from copy import deepcopy
import datetime as dt
from functools import partial
import gc
from json import loads
import operator
import pickle
import re
import sys
from typing import (
TYPE_CHECKING,
Any,
Callable,
ClassVar,
Literal,
NoReturn,
cast,
final,
overload,
)
import warnings
import weakref
import numpy as np
from pandas._config import (
config,
using_copy_on_write,
warn_copy_on_write,
)
from pandas._libs import lib
from pandas._libs.lib import is_range_indexer
from pandas._libs.tslibs import (
Period,
Tick,
Timestamp,
to_offset,
)
from pandas._libs.tslibs.dtypes import freq_to_period_freqstr
from pandas._typing import (
AlignJoin,
AnyArrayLike,
ArrayLike,
Axes,
Axis,
AxisInt,
CompressionOptions,
DtypeArg,
DtypeBackend,
DtypeObj,
FilePath,
FillnaOptions,
FloatFormatType,
FormattersType,
Frequency,
IgnoreRaise,
IndexKeyFunc,
IndexLabel,
InterpolateOptions,
IntervalClosedType,
JSONSerializable,
Level,
Manager,
NaPosition,
NDFrameT,
OpenFileErrors,
RandomState,
ReindexMethod,
Renamer,
Scalar,
Self,
SequenceNotStr,
SortKind,
StorageOptions,
Suffixes,
T,
TimeAmbiguous,
TimedeltaConvertibleTypes,
TimeNonexistent,
TimestampConvertibleTypes,
TimeUnit,
ValueKeyFunc,
WriteBuffer,
WriteExcelBuffer,
npt,
)
from pandas.compat import PYPY
from pandas.compat._constants import REF_COUNT
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.errors import (
AbstractMethodError,
ChainedAssignmentError,
InvalidIndexError,
SettingWithCopyError,
SettingWithCopyWarning,
_chained_assignment_method_msg,
_chained_assignment_warning_method_msg,
_check_cacher,
)
from pandas.util._decorators import (
deprecate_nonkeyword_arguments,
doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
check_dtype_backend,
validate_ascending,
validate_bool_kwarg,
validate_fillna_kwargs,
validate_inclusive,
)
from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.common import (
ensure_object,
ensure_platform_int,
ensure_str,
is_bool,
is_bool_dtype,
is_dict_like,
is_extension_array_dtype,
is_list_like,
is_number,
is_numeric_dtype,
is_re_compilable,
is_scalar,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
ExtensionDtype,
)
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCSeries,
)
from pandas.core.dtypes.inference import (
is_hashable,
is_nested_list_like,
)
from pandas.core.dtypes.missing import (
isna,
notna,
)
from pandas.core import (
algorithms as algos,
arraylike,
common,
indexing,
missing,
nanops,
sample,
)
from pandas.core.array_algos.replace import should_use_regex
from pandas.core.arrays import ExtensionArray
from pandas.core.base import PandasObject
from pandas.core.construction import extract_array
from pandas.core.flags import Flags
from pandas.core.indexes.api import (
DatetimeIndex,
Index,
MultiIndex,
PeriodIndex,
RangeIndex,
default_index,
ensure_index,
)
from pandas.core.internals import (
ArrayManager,
BlockManager,
SingleArrayManager,
)
from pandas.core.internals.construction import (
mgr_to_mgr,
ndarray_to_mgr,
)
from pandas.core.methods.describe import describe_ndframe
from pandas.core.missing import (
clean_fill_method,
clean_reindex_fill_method,
find_valid_index,
)
from pandas.core.reshape.concat import concat
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import get_indexer_indexer
from pandas.core.window import (
Expanding,
ExponentialMovingWindow,
Rolling,
Window,
)
from pandas.io.formats.format import (
DataFrameFormatter,
DataFrameRenderer,
)
from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
from collections.abc import (
Hashable,
Iterator,
Mapping,
Sequence,
)
from pandas._libs.tslibs import BaseOffset
from pandas import (
DataFrame,
ExcelWriter,
HDFStore,
Series,
)
from pandas.core.indexers.objects import BaseIndexer
from pandas.core.resample import Resampler
# goal is to be able to define the docs close to function, while still being
# able to share
_shared_docs = {**_shared_docs}
_shared_doc_kwargs = {
"axes": "keywords for axes",
"klass": "Series/DataFrame",
"axes_single_arg": "{0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame", # noqa: E501
"inplace": """
inplace : bool, default False
If True, performs operation inplace and returns None.""",
"optional_by": """
by : str or list of str
Name or list of names to sort by""",
}
bool_t = bool # Need alias because NDFrame has def bool:
class NDFrame(PandasObject, indexing.IndexingMixin):
"""
N-dimensional analogue of DataFrame. Store multi-dimensional in a
size-mutable, labeled data structure
Parameters
----------
data : BlockManager
axes : list
copy : bool, default False
"""
_internal_names: list[str] = [
"_mgr",
"_cacher",
"_item_cache",
"_cache",
"_is_copy",
"_name",
"_metadata",
"_flags",
]
_internal_names_set: set[str] = set(_internal_names)
_accessors: set[str] = set()
_hidden_attrs: frozenset[str] = frozenset([])
_metadata: list[str] = []
_is_copy: weakref.ReferenceType[NDFrame] | str | None = None
_mgr: Manager
_attrs: dict[Hashable, Any]
_typ: str
# ----------------------------------------------------------------------
# Constructors
def __init__(self, data: Manager) -> None:
object.__setattr__(self, "_is_copy", None)
object.__setattr__(self, "_mgr", data)
object.__setattr__(self, "_item_cache", {})
object.__setattr__(self, "_attrs", {})
object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))
@final
@classmethod
def _init_mgr(
cls,
mgr: Manager,
axes: dict[Literal["index", "columns"], Axes | None],
dtype: DtypeObj | None = None,
copy: bool_t = False,
) -> Manager:
"""passed a manager and a axes dict"""
for a, axe in axes.items():
if axe is not None:
axe = ensure_index(axe)
bm_axis = cls._get_block_manager_axis(a)
mgr = mgr.reindex_axis(axe, axis=bm_axis)
# make a copy if explicitly requested
if copy:
mgr = mgr.copy()
if dtype is not None:
# avoid further copies if we can
if (
isinstance(mgr, BlockManager)
and len(mgr.blocks) == 1
and mgr.blocks[0].values.dtype == dtype
):
pass
else:
mgr = mgr.astype(dtype=dtype)
return mgr
@final
def _as_manager(self, typ: str, copy: bool_t = True) -> Self:
"""
Private helper function to create a DataFrame with specific manager.
Parameters
----------
typ : {"block", "array"}
copy : bool, default True
Only controls whether the conversion from Block->ArrayManager
copies the 1D arrays (to ensure proper/contiguous memory layout).
Returns
-------
DataFrame
New DataFrame using specified manager type. Is not guaranteed
to be a copy or not.
"""
new_mgr: Manager
new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
# fastpath of passing a manager doesn't check the option/manager class
return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self)
@classmethod
def _from_mgr(cls, mgr: Manager, axes: list[Index]) -> Self:
"""
Construct a new object of this type from a Manager object and axes.
Parameters
----------
mgr : Manager
Must have the same ndim as cls.
axes : list[Index]
Notes
-----
The axes must match mgr.axes, but are required for future-proofing
in the event that axes are refactored out of the Manager objects.
"""
obj = cls.__new__(cls)
NDFrame.__init__(obj, mgr)
return obj
# ----------------------------------------------------------------------
# attrs and flags
@property
def attrs(self) -> dict[Hashable, Any]:
"""
Dictionary of global attributes of this dataset.
.. warning::
attrs is experimental and may change without warning.
See Also
--------
DataFrame.flags : Global flags applying to this object.
Notes
-----
Many operations that create new datasets will copy ``attrs``. Copies
are always deep so that changing ``attrs`` will only affect the
present dataset. ``pandas.concat`` copies ``attrs`` only if all input
datasets have the same ``attrs``.
Examples
--------
For Series:
>>> ser = pd.Series([1, 2, 3])
>>> ser.attrs = {"A": [10, 20, 30]}
>>> ser.attrs
{'A': [10, 20, 30]}
For DataFrame:
>>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
>>> df.attrs = {"A": [10, 20, 30]}
>>> df.attrs
{'A': [10, 20, 30]}
"""
return self._attrs
@attrs.setter
def attrs(self, value: Mapping[Hashable, Any]) -> None:
self._attrs = dict(value)
@final
@property
def flags(self) -> Flags:
"""
Get the properties associated with this pandas object.
The available flags are
* :attr:`Flags.allows_duplicate_labels`
See Also
--------
Flags : Flags that apply to pandas objects.
DataFrame.attrs : Global metadata applying to this dataset.
Notes
-----
"Flags" differ from "metadata". Flags reflect properties of the
pandas object (the Series or DataFrame). Metadata refer to properties
of the dataset, and should be stored in :attr:`DataFrame.attrs`.
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2]})
>>> df.flags
<Flags(allows_duplicate_labels=True)>
Flags can be get or set using ``.``
>>> df.flags.allows_duplicate_labels
True
>>> df.flags.allows_duplicate_labels = False
Or by slicing with a key
>>> df.flags["allows_duplicate_labels"]
False
>>> df.flags["allows_duplicate_labels"] = True
"""
return self._flags
@final
def set_flags(
self,
*,
copy: bool_t = False,
allows_duplicate_labels: bool_t | None = None,
) -> Self:
"""
Return a new object with updated flags.
Parameters
----------
copy : bool, default False
Specify if a copy of the object should be made.
.. note::
The `copy` keyword will change behavior in pandas 3.0.
`Copy-on-Write
<https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
will be enabled by default, which means that all methods with a
`copy` keyword will use a lazy copy mechanism to defer the copy and
ignore the `copy` keyword. The `copy` keyword will be removed in a
future version of pandas.
You can already get the future behavior and improvements through
enabling copy on write ``pd.options.mode.copy_on_write = True``
allows_duplicate_labels : bool, optional
Whether the returned object allows duplicate labels.
Returns
-------
Series or DataFrame
The same type as the caller.
See Also
--------
DataFrame.attrs : Global metadata applying to this dataset.
DataFrame.flags : Global flags applying to this object.
Notes
-----
This method returns a new object that's a view on the same data
as the input. Mutating the input or the output values will be reflected
in the other.
This method is intended to be used in method chains.
"Flags" differ from "metadata". Flags reflect properties of the
pandas object (the Series or DataFrame). Metadata refer to properties
of the dataset, and should be stored in :attr:`DataFrame.attrs`.
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2]})
>>> df.flags.allows_duplicate_labels
True
>>> df2 = df.set_flags(allows_duplicate_labels=False)
>>> df2.flags.allows_duplicate_labels
False
"""
df = self.copy(deep=copy and not using_copy_on_write())
if allows_duplicate_labels is not None:
df.flags["allows_duplicate_labels"] = allows_duplicate_labels
return df
@final
@classmethod
def _validate_dtype(cls, dtype) -> DtypeObj | None:
"""validate the passed dtype"""
if dtype is not None:
dtype = pandas_dtype(dtype)
# a compound dtype
if dtype.kind == "V":
raise NotImplementedError(
"compound dtypes are not implemented "
f"in the {cls.__name__} constructor"
)
return dtype
# ----------------------------------------------------------------------
# Construction
@property
def _constructor(self) -> Callable[..., Self]:
"""
Used when a manipulation result has the same dimensions as the
original.
"""
raise AbstractMethodError(self)
# ----------------------------------------------------------------------
# Internals
@final
@property
def _data(self):
# GH#33054 retained because some downstream packages uses this,
# e.g. fastparquet
# GH#33333
warnings.warn(
f"{type(self).__name__}._data is deprecated and will be removed in "
"a future version. Use public APIs instead.",
DeprecationWarning,
stacklevel=find_stack_level(),
)
return self._mgr
# ----------------------------------------------------------------------
# Axis
_AXIS_ORDERS: list[Literal["index", "columns"]]
_AXIS_TO_AXIS_NUMBER: dict[Axis, AxisInt] = {0: 0, "index": 0, "rows": 0}
_info_axis_number: int
_info_axis_name: Literal["index", "columns"]
_AXIS_LEN: int
@final
def _construct_axes_dict(self, axes: Sequence[Axis] | None = None, **kwargs):
"""Return an axes dictionary for myself."""
d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
# error: Argument 1 to "update" of "MutableMapping" has incompatible type
# "Dict[str, Any]"; expected "SupportsKeysAndGetItem[Union[int, str], Any]"
d.update(kwargs) # type: ignore[arg-type]
return d
@final
@classmethod
def _get_axis_number(cls, axis: Axis) -> AxisInt:
try:
return cls._AXIS_TO_AXIS_NUMBER[axis]
except KeyError:
raise ValueError(f"No axis named {axis} for object type {cls.__name__}")
@final
@classmethod
def _get_axis_name(cls, axis: Axis) -> Literal["index", "columns"]:
axis_number = cls._get_axis_number(axis)
return cls._AXIS_ORDERS[axis_number]
@final
def _get_axis(self, axis: Axis) -> Index:
axis_number = self._get_axis_number(axis)
assert axis_number in {0, 1}
return self.index if axis_number == 0 else self.columns
@final
@classmethod
def _get_block_manager_axis(cls, axis: Axis) -> AxisInt:
"""Map the axis to the block_manager axis."""
axis = cls._get_axis_number(axis)
ndim = cls._AXIS_LEN
if ndim == 2:
# i.e. DataFrame
return 1 - axis
return axis
@final
def _get_axis_resolvers(self, axis: str) -> dict[str, Series | MultiIndex]:
# index or columns
axis_index = getattr(self, axis)
d = {}
prefix = axis[0]
for i, name in enumerate(axis_index.names):
if name is not None:
key = level = name
else:
# prefix with 'i' or 'c' depending on the input axis
# e.g., you must do ilevel_0 for the 0th level of an unnamed
# multiiindex
key = f"{prefix}level_{i}"
level = i
level_values = axis_index.get_level_values(level)
s = level_values.to_series()
s.index = axis_index
d[key] = s
# put the index/columns itself in the dict
if isinstance(axis_index, MultiIndex):
dindex = axis_index
else:
dindex = axis_index.to_series()
d[axis] = dindex
return d
@final
def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]:
from pandas.core.computation.parsing import clean_column_name
d: dict[str, Series | MultiIndex] = {}
for axis_name in self._AXIS_ORDERS:
d.update(self._get_axis_resolvers(axis_name))
return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)}
@final
def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:
"""
Return the special character free column resolvers of a dataframe.
Column names with special characters are 'cleaned up' so that they can
be referred to by backtick quoting.
Used in :meth:`DataFrame.eval`.
"""
from pandas.core.computation.parsing import clean_column_name
from pandas.core.series import Series
if isinstance(self, ABCSeries):
return {clean_column_name(self.name): self}
return {
clean_column_name(k): Series(
v, copy=False, index=self.index, name=k, dtype=self.dtypes[k]
).__finalize__(self)
for k, v in zip(self.columns, self._iter_column_arrays())
if not isinstance(k, int)
}
@final
@property
def _info_axis(self) -> Index:
return getattr(self, self._info_axis_name)
def _is_view_after_cow_rules(self):
# Only to be used in cases of chained assignment checks, this is a
# simplified check that assumes that either the whole object is a view
# or a copy
if len(self._mgr.blocks) == 0: # type: ignore[union-attr]
return False
return self._mgr.blocks[0].refs.has_reference() # type: ignore[union-attr]
@property
def shape(self) -> tuple[int, ...]:
"""
Return a tuple of axis dimensions
"""
return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)
@property
def axes(self) -> list[Index]:
"""
Return index label(s) of the internal NDFrame
"""
# we do it this way because if we have reversed axes, then
# the block manager shows then reversed
return [self._get_axis(a) for a in self._AXIS_ORDERS]
@final
@property
def ndim(self) -> int:
"""
Return an int representing the number of axes / array dimensions.
Return 1 if Series. Otherwise return 2 if DataFrame.
See Also
--------
ndarray.ndim : Number of array dimensions.
Examples
--------
>>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
>>> s.ndim
1
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.ndim
2
"""
return self._mgr.ndim
@final
@property
def size(self) -> int:
"""
Return an int representing the number of elements in this object.
Return the number of rows if Series. Otherwise return the number of
rows times number of columns if DataFrame.
See Also
--------
ndarray.size : Number of elements in the array.
Examples
--------
>>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
>>> s.size
3
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.size
4
"""
return int(np.prod(self.shape))
def set_axis(
self,
labels,
*,
axis: Axis = 0,
copy: bool_t | None = None,
) -> Self:
"""
Assign desired index to given axis.
Indexes for%(extended_summary_sub)s row labels can be changed by assigning
a list-like or Index.
Parameters
----------
labels : list-like, Index
The values for the new index.
axis : %(axes_single_arg)s, default 0
The axis to update. The value 0 identifies the rows. For `Series`
this parameter is unused and defaults to 0.
copy : bool, default True
Whether to make a copy of the underlying data.
.. note::
The `copy` keyword will change behavior in pandas 3.0.
`Copy-on-Write
<https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
will be enabled by default, which means that all methods with a
`copy` keyword will use a lazy copy mechanism to defer the copy and
ignore the `copy` keyword. The `copy` keyword will be removed in a
future version of pandas.
You can already get the future behavior and improvements through
enabling copy on write ``pd.options.mode.copy_on_write = True``
Returns
-------
%(klass)s
An object of type %(klass)s.
See Also
--------
%(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s.
"""
return self._set_axis_nocheck(labels, axis, inplace=False, copy=copy)
@final
def _set_axis_nocheck(
self, labels, axis: Axis, inplace: bool_t, copy: bool_t | None
):
if inplace:
setattr(self, self._get_axis_name(axis), labels)
else:
# With copy=False, we create a new object but don't copy the
# underlying data.
obj = self.copy(deep=copy and not using_copy_on_write())
setattr(obj, obj._get_axis_name(axis), labels)
return obj
@final
def _set_axis(self, axis: AxisInt, labels: AnyArrayLike | list) -> None:
"""
This is called from the cython code when we set the `index` attribute
directly, e.g. `series.index = [1, 2, 3]`.
"""
labels = ensure_index(labels)
self._mgr.set_axis(axis, labels)
self._clear_item_cache()
@final
def swapaxes(self, axis1: Axis, axis2: Axis, copy: bool_t | None = None) -> Self:
"""
Interchange axes and swap values axes appropriately.
.. deprecated:: 2.1.0
``swapaxes`` is deprecated and will be removed.
Please use ``transpose`` instead.
Returns
-------
same as input
Examples
--------
Please see examples for :meth:`DataFrame.transpose`.
"""
warnings.warn(
# GH#51946
f"'{type(self).__name__}.swapaxes' is deprecated and "
"will be removed in a future version. "
f"Please use '{type(self).__name__}.transpose' instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
i = self._get_axis_number(axis1)
j = self._get_axis_number(axis2)
if i ==… a list
will call the method numerous times.
format : dict, list of dict
Keyword args to pass to the method call of ``Styler.format``. If a list will
call the method numerous times.
format_index : dict, list of dict
Keyword args to pass to the method call of ``Styler.format_index``. If a
list will call the method numerous times.
render_kwargs : dict
Keyword args to pass to the method call of ``Styler.to_latex``.
Returns
-------
str or None
If buf is None, returns the result as a string. Otherwise returns None.
"""
from pandas.io.formats.style import Styler
self = cast("DataFrame", self)
styler = Styler(self, uuid="")
for kw_name in ["hide", "relabel_index", "format", "format_index"]:
kw = vars()[kw_name]
if isinstance(kw, dict):
getattr(styler, kw_name)(**kw)
elif isinstance(kw, list):
for sub_kw in kw:
getattr(styler, kw_name)(**sub_kw)
# bold_rows is not a direct kwarg of Styler.to_latex
render_kwargs = {} if render_kwargs is None else render_kwargs
if render_kwargs.pop("bold_rows"):
styler.map_index(lambda v: "textbf:--rwrap;")
return styler.to_latex(buf=buf, **render_kwargs)
@overload
def to_csv(
self,
path_or_buf: None = ...,
sep: str = ...,
na_rep: str = ...,
float_format: str | Callable | None = ...,
columns: Sequence[Hashable] | None = ...,
header: bool_t | list[str] = ...,
index: bool_t = ...,
index_label: IndexLabel | None = ...,
mode: str = ...,
encoding: str | None = ...,
compression: CompressionOptions = ...,
quoting: int | None = ...,
quotechar: str = ...,
lineterminator: str | None = ...,
chunksize: int | None = ...,
date_format: str | None = ...,
doublequote: bool_t = ...,
escapechar: str | None = ...,
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
) -> str:
...
@overload
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
sep: str = ...,
na_rep: str = ...,
float_format: str | Callable | None = ...,
columns: Sequence[Hashable] | None = ...,
header: bool_t | list[str] = ...,
index: bool_t = ...,
index_label: IndexLabel | None = ...,
mode: str = ...,
encoding: str | None = ...,
compression: CompressionOptions = ...,
quoting: int | None = ...,
quotechar: str = ...,
lineterminator: str | None = ...,
chunksize: int | None = ...,
date_format: str | None = ...,
doublequote: bool_t = ...,
escapechar: str | None = ...,
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
) -> None:
...
@final
@deprecate_nonkeyword_arguments(
version="3.0", allowed_args=["self", "path_or_buf"], name="to_csv"
)
@doc(
storage_options=_shared_docs["storage_options"],
compression_options=_shared_docs["compression_options"] % "path_or_buf",
)
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
sep: str = ",",
na_rep: str = "",
float_format: str | Callable | None = None,
columns: Sequence[Hashable] | None = None,
header: bool_t | list[str] = True,
index: bool_t = True,
index_label: IndexLabel | None = None,
mode: str = "w",
encoding: str | None = None,
compression: CompressionOptions = "infer",
quoting: int | None = None,
quotechar: str = '"',
lineterminator: str | None = None,
chunksize: int | None = None,
date_format: str | None = None,
doublequote: bool_t = True,
escapechar: str | None = None,
decimal: str = ".",
errors: OpenFileErrors = "strict",
storage_options: StorageOptions | None = None,
) -> str | None:
r"""
Write object to a comma-separated values (csv) file.
Parameters
----------
path_or_buf : str, path object, file-like object, or None, default None
String, path object (implementing os.PathLike[str]), or file-like
object implementing a write() function. If None, the result is
returned as a string. If a non-binary file object is passed, it should
be opened with `newline=''`, disabling universal newlines. If a binary
file object is passed, `mode` might need to contain a `'b'`.
sep : str, default ','
String of length 1. Field delimiter for the output file.
na_rep : str, default ''
Missing data representation.
float_format : str, Callable, default None
Format string for floating point numbers. If a Callable is given, it takes
precedence over other numeric formatting parameters, like decimal.
columns : sequence, optional
Columns to write.
header : bool or list of str, default True
Write out the column names. If a list of strings is given it is
assumed to be aliases for the column names.
index : bool, default True
Write row names (index).
index_label : str or sequence, or False, default None
Column label for index column(s) if desired. If None is given, and
`header` and `index` are True, then the index names are used. A
sequence should be given if the object uses MultiIndex. If
False do not print fields for index names. Use index_label=False
for easier importing in R.
mode : {{'w', 'x', 'a'}}, default 'w'
Forwarded to either `open(mode=)` or `fsspec.open(mode=)` to control
the file opening. Typical values include:
- 'w', truncate the file first.
- 'x', exclusive creation, failing if the file already exists.
- 'a', append to the end of file if it exists.
encoding : str, optional
A string representing the encoding to use in the output file,
defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
is a non-binary file object.
{compression_options}
May be a dict with key 'method' as compression mode
and other entries as additional compression options if
compression mode is 'zip'.
Passing compression options as keys in dict is
supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.
quoting : optional constant from csv module
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
will treat them as non-numeric.
quotechar : str, default '\"'
String of length 1. Character used to quote fields.
lineterminator : str, optional
The newline character or character sequence to use in the output
file. Defaults to `os.linesep`, which depends on the OS in which
this method is called ('\\n' for linux, '\\r\\n' for Windows, i.e.).
.. versionchanged:: 1.5.0
Previously was line_terminator, changed for consistency with
read_csv and the standard library 'csv' module.
chunksize : int or None
Rows to write at a time.
date_format : str, default None
Format string for datetime objects.
doublequote : bool, default True
Control quoting of `quotechar` inside a field.
escapechar : str, default None
String of length 1. Character used to escape `sep` and `quotechar`
when appropriate.
decimal : str, default '.'
Character recognized as decimal separator. E.g. use ',' for
European data.
errors : str, default 'strict'
Specifies how encoding and decoding errors are to be handled.
See the errors argument for :func:`open` for a full list
of options.
{storage_options}
Returns
-------
None or str
If path_or_buf is None, returns the resulting csv format as a
string. Otherwise returns None.
See Also
--------
read_csv : Load a CSV file into a DataFrame.
to_excel : Write DataFrame to an Excel file.
Examples
--------
Create 'out.csv' containing 'df' without indices
>>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'],
... 'mask': ['red', 'purple'],
... 'weapon': ['sai', 'bo staff']}})
>>> df.to_csv('out.csv', index=False) # doctest: +SKIP
Create 'out.zip' containing 'out.csv'
>>> df.to_csv(index=False)
'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
>>> compression_opts = dict(method='zip',
... archive_name='out.csv') # doctest: +SKIP
>>> df.to_csv('out.zip', index=False,
... compression=compression_opts) # doctest: +SKIP
To write a csv file to a new folder or nested folder you will first
need to create it using either Pathlib or os:
>>> from pathlib import Path # doctest: +SKIP
>>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP
>>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP
>>> df.to_csv(filepath) # doctest: +SKIP
>>> import os # doctest: +SKIP
>>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP
>>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP
"""
df = self if isinstance(self, ABCDataFrame) else self.to_frame()
formatter = DataFrameFormatter(
frame=df,
header=header,
index=index,
na_rep=na_rep,
float_format=float_format,
decimal=decimal,
)
return DataFrameRenderer(formatter).to_csv(
path_or_buf,
lineterminator=lineterminator,
sep=sep,
encoding=encoding,
errors=errors,
compression=compression,
quoting=quoting,
columns=columns,
index_label=index_label,
mode=mode,
chunksize=chunksize,
quotechar=quotechar,
date_format=date_format,
doublequote=doublequote,
escapechar=escapechar,
storage_options=storage_options,
)
# ----------------------------------------------------------------------
# Lookup Caching
def _reset_cacher(self) -> None:
"""
Reset the cacher.
"""
raise AbstractMethodError(self)
def _maybe_update_cacher(
self,
clear: bool_t = False,
verify_is_copy: bool_t = True,
inplace: bool_t = False,
) -> None:
"""
See if we need to update our parent cacher if clear, then clear our
cache.
Parameters
----------
clear : bool, default False
Clear the item cache.
verify_is_copy : bool, default True
Provide is_copy checks.
"""
if using_copy_on_write():
return
if verify_is_copy:
self._check_setitem_copy(t="referent")
if clear:
self._clear_item_cache()
def _clear_item_cache(self) -> None:
raise AbstractMethodError(self)
# ----------------------------------------------------------------------
# Indexing Methods
@final
def take(self, indices, axis: Axis = 0, **kwargs) -> Self:
"""
Return the elements in the given *positional* indices along an axis.
This means that we are not indexing according to actual values in
the index attribute of the object. We are indexing according to the
actual position of the element in the object.
Parameters
----------
indices : array-like
An array of ints indicating which positions to take.
axis : {0 or 'index', 1 or 'columns', None}, default 0
The axis on which to select elements. ``0`` means that we are
selecting rows, ``1`` means that we are selecting columns.
For `Series` this parameter is unused and defaults to 0.
**kwargs
For compatibility with :meth:`numpy.take`. Has no effect on the
output.
Returns
-------
same type as caller
An array-like containing the elements taken from the object.
See Also
--------
DataFrame.loc : Select a subset of a DataFrame by labels.
DataFrame.iloc : Select a subset of a DataFrame by positions.
numpy.take : Take elements from an array along an axis.
Examples
--------
>>> df = pd.DataFrame([('falcon', 'bird', 389.0),
... ('parrot', 'bird', 24.0),
... ('lion', 'mammal', 80.5),
... ('monkey', 'mammal', np.nan)],
... columns=['name', 'class', 'max_speed'],
... index=[0, 2, 3, 1])
>>> df
name class max_speed
0 falcon bird 389.0
2 parrot bird 24.0
3 lion mammal 80.5
1 monkey mammal NaN
Take elements at positions 0 and 3 along the axis 0 (default).
Note how the actual indices selected (0 and 1) do not correspond to
our selected indices 0 and 3. That's because we are selecting the 0th
and 3rd rows, not rows whose indices equal 0 and 3.
>>> df.take([0, 3])
name class max_speed
0 falcon bird 389.0
1 monkey mammal NaN
Take elements at indices 1 and 2 along the axis 1 (column selection).
>>> df.take([1, 2], axis=1)
class max_speed
0 bird 389.0
2 bird 24.0
3 mammal 80.5
1 mammal NaN
We may take elements using negative integers for positive indices,
starting from the end of the object, just like with Python lists.
>>> df.take([-1, -2])
name class max_speed
1 monkey mammal NaN
3 lion mammal 80.5
"""
nv.validate_take((), kwargs)
if not isinstance(indices, slice):
indices = np.asarray(indices, dtype=np.intp)
if (
axis == 0
and indices.ndim == 1
and using_copy_on_write()
and is_range_indexer(indices, len(self))
):
return self.copy(deep=None)
elif self.ndim == 1:
raise TypeError(
f"{type(self).__name__}.take requires a sequence of integers, "
"not slice."
)
else:
warnings.warn(
# GH#51539
f"Passing a slice to {type(self).__name__}.take is deprecated "
"and will raise in a future version. Use `obj[slicer]` or pass "
"a sequence of integers instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
# We can get here with a slice via DataFrame.__getitem__
indices = np.arange(
indices.start, indices.stop, indices.step, dtype=np.intp
)
new_data = self._mgr.take(
indices,
axis=self._get_block_manager_axis(axis),
verify=True,
)
return self._constructor_from_mgr(new_data, axes=new_data.axes).__finalize__(
self, method="take"
)
@final
def _take_with_is_copy(self, indices, axis: Axis = 0) -> Self:
"""
Internal version of the `take` method that sets the `_is_copy`
attribute to keep track of the parent dataframe (using in indexing
for the SettingWithCopyWarning).
For Series this does the same as the public take (it never sets `_is_copy`).
See the docstring of `take` for full explanation of the parameters.
"""
result = self.take(indices=indices, axis=axis)
# Maybe set copy if we didn't actually change the index.
if self.ndim == 2 and not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
return result
@final
def xs(
self,
key: IndexLabel,
axis: Axis = 0,
level: IndexLabel | None = None,
drop_level: bool_t = True,
) -> Self:
"""
Return cross-section from the Series/DataFrame.
This method takes a `key` argument to select data at a particular
level of a MultiIndex.
Parameters
----------
key : label or tuple of label
Label contained in the index, or partially in a MultiIndex.
axis : {0 or 'index', 1 or 'columns'}, default 0
Axis to retrieve cross-section on.
level : object, defaults to first n levels (n=1 or len(key))
In case of a key partially contained in a MultiIndex, indicate
which levels are used. Levels can be referred by label or position.
drop_level : bool, default True
If False, returns object with same levels as self.
Returns
-------
Series or DataFrame
Cross-section from the original Series or DataFrame
corresponding to the selected index levels.
See Also
--------
DataFrame.loc : Access a group of rows and columns
by label(s) or a boolean array.
DataFrame.iloc : Purely integer-location based indexing
for selection by position.
Notes
-----
`xs` can not be used to set values.
MultiIndex Slicers is a generic way to get/set values on
any level or levels.
It is a superset of `xs` functionality, see
:ref:`MultiIndex Slicers <advanced.mi_slicers>`.
Examples
--------
>>> d = {'num_legs': [4, 4, 2, 2],
... 'num_wings': [0, 0, 2, 2],
... 'class': ['mammal', 'mammal', 'mammal', 'bird'],
... 'animal': ['cat', 'dog', 'bat', 'penguin'],
... 'locomotion': ['walks', 'walks', 'flies', 'walks']}
>>> df = pd.DataFrame(data=d)
>>> df = df.set_index(['class', 'animal', 'locomotion'])
>>> df
num_legs num_wings
class animal locomotion
mammal cat walks 4 0
dog walks 4 0
bat flies 2 2
bird penguin walks 2 2
Get values at specified index
>>> df.xs('mammal')
num_legs num_wings
animal locomotion
cat walks 4 0
dog walks 4 0
bat flies 2 2
Get values at several indexes
>>> df.xs(('mammal', 'dog', 'walks'))
num_legs 4
num_wings 0
Name: (mammal, dog, walks), dtype: int64
Get values at specified index and level
>>> df.xs('cat', level=1)
num_legs num_wings
class locomotion
mammal walks 4 0
Get values at several indexes and levels
>>> df.xs(('bird', 'walks'),
... level=[0, 'locomotion'])
num_legs num_wings
animal
penguin 2 2
Get values at specified column and axis
>>> df.xs('num_wings', axis=1)
class animal locomotion
mammal cat walks 0
dog walks 0
bat flies 2
bird penguin walks 2
Name: num_wings, dtype: int64
"""
axis = self._get_axis_number(axis)
labels = self._get_axis(axis)
if isinstance(key, list):
raise TypeError("list keys are not supported in xs, pass a tuple instead")
if level is not None:
if not isinstance(labels, MultiIndex):
raise TypeError("Index must be a MultiIndex")
loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)
# create the tuple of the indexer
_indexer = [slice(None)] * self.ndim
_indexer[axis] = loc
indexer = tuple(_indexer)
result = self.iloc[indexer]
setattr(result, result._get_axis_name(axis), new_ax)
return result
if axis == 1:
if drop_level:
return self[key]
index = self.columns
else:
index = self.index
if isinstance(index, MultiIndex):
loc, new_index = index._get_loc_level(key, level=0)
if not drop_level:
if lib.is_integer(loc):
# Slice index must be an integer or None
new_index = index[loc : loc + 1]
else:
new_index = index[loc]
else:
loc = index.get_loc(key)
if isinstance(loc, np.ndarray):
if loc.dtype == np.bool_:
(inds,) = loc.nonzero()
return self._take_with_is_copy(inds, axis=axis)
else:
return self._take_with_is_copy(loc, axis=axis)
if not is_scalar(loc):
new_index = index[loc]
if is_scalar(loc) and axis == 0:
# In this case loc should be an integer
if self.ndim == 1:
# if we encounter an array-like and we only have 1 dim
# that means that their are list/ndarrays inside the Series!
# so just return them (GH 6394)
return self._values[loc]
new_mgr = self._mgr.fast_xs(loc)
result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes)
result._name = self.index[loc]
result = result.__finalize__(self)
elif is_scalar(loc):
result = self.iloc[:, slice(loc, loc + 1)]
elif axis == 1:
result = self.iloc[:, loc]
else:
result = self.iloc[loc]
result.index = new_index
# this could be a view
# but only in a single-dtyped view sliceable case
result._set_is_copy(self, copy=not result._is_view)
return result
def __getitem__(self, item):
raise AbstractMethodError(self)
@final
def _getitem_slice(self, key: slice) -> Self:
"""
__getitem__ for the case where the key is a slice object.
"""
# _convert_slice_indexer to determine if this slice is positional
# or label based, and if the latter, convert to positional
slobj = self.index._convert_slice_indexer(key, kind="getitem")
if isinstance(slobj, np.ndarray):
# reachable with DatetimeIndex
indexer = lib.maybe_indices_to_slice(
slobj.astype(np.intp, copy=False), len(self)
)
if isinstance(indexer, np.ndarray):
# GH#43223 If we can not convert, use take
return self.take(indexer, axis=0)
slobj = indexer
return self._slice(slobj)
def _slice(self, slobj: slice, axis: AxisInt = 0) -> Self:
"""
Construct a slice of this container.
Slicing with this method is *always* positional.
"""
assert isinstance(slobj, slice), type(slobj)
axis = self._get_block_manager_axis(axis)
new_mgr = self._mgr.get_slice(slobj, axis=axis)
result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
result = result.__finalize__(self)
# this could be a view
# but only in a single-dtyped view sliceable case
is_copy = axis != 0 or result._is_view
result._set_is_copy(self, copy=is_copy)
return result
@final
def _set_is_copy(self, ref: NDFrame, copy: bool_t = True) -> None:
if not copy:
self._is_copy = None
else:
assert ref is not None
self._is_copy = weakref.ref(ref)
def _check_is_chained_assignment_possible(self) -> bool_t:
"""
Check if we are a view, have a cacher, and are of mixed type.
If so, then force a setitem_copy check.
Should be called just near setting a value
Will return a boolean if it we are a view and are cached, but a
single-dtype meaning that the cacher should be updated following
setting.
"""
if self._is_copy:
self._check_setitem_copy(t="referent")
return False
@final
def _check_setitem_copy(self, t: str = "setting", force: bool_t = False):
"""
Parameters
----------
t : str, the type of setting error
force : bool, default False
If True, then force showing an error.
validate if we are doing a setitem on a chained copy.
It is technically possible to figure out that we are setting on
a copy even WITH a multi-dtyped pandas object. In other words, some
blocks may be views while other are not. Currently _is_view will ALWAYS
return False for multi-blocks to avoid having to handle this case.
df = DataFrame(np.arange(0,9), columns=['count'])
df['group'] = 'b'
# This technically need not raise SettingWithCopy if both are view
# (which is not generally guaranteed but is usually True. However,
# this is in general not a good practice and we recommend using .loc.
df.iloc[0:5]['group'] = 'a'
"""
if using_copy_on_write() or warn_copy_on_write():
return
# return early if the check is not needed
if not (force or self._is_copy):
return
value = config.get_option("mode.chained_assignment")
if value is None:
return
# see if the copy is not actually referred; if so, then dissolve
# the copy weakref
if self._is_copy is not None and not isinstance(self._is_copy, str):
r = self._is_copy()
if not gc.get_referents(r) or (r is not None and r.shape == self.shape):
self._is_copy = None
return
# a custom message
if isinstance(self._is_copy, str):
t = self._is_copy
elif t == "referent":
t = (
"\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame\n\n"
"See the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy"
)
else:
t = (
"\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame.\n"
"Try using .loc[row_indexer,col_indexer] = value "
"instead\n\nSee the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy"
)
if value == "raise":
raise SettingWithCopyError(t)
if value == "warn":
> warnings.warn(t, SettingWithCopyWarning, stacklevel=find_stack_level())
E pandas.errors.SettingWithCopyWarning:
E A value is trying to be set on a copy of a slice from a DataFrame.
E Try using .loc[row_indexer,col_indexer] = value instead
E
E See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\generic.py:4472: SettingWithCopyWarning
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
10 out of 12 runs failed: test_merge[False-outer] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 9s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 4s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 6s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 6s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 7s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 7s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 8s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 7s]
Raw output
pandas.errors.SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:56700', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'outer', disk = False
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
await list_eq(joined, pd.merge(A, B, on="y", how=how))
assert all(d is None for d in joined.divisions)
await list_eq(
dd.merge(a, b, left_on="x", right_on="z", how=how),
pd.merge(A, B, left_on="x", right_on="z", how=how),
)
await list_eq(
dd.merge(a, b, left_on="x", right_on="z", how=how, suffixes=("1", "2")),
pd.merge(A, B, left_on="x", right_on="z", how=how, suffixes=("1", "2")),
)
await list_eq(dd.merge(a, b, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(a, B, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(A, b, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(A, B, how=how), pd.merge(A, B, how=how))
await list_eq(
dd.merge(a, b, left_index=True, right_index=True, how=how),
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
await list_eq(
dd.merge(
a, b, left_index=True, right_index=True, how=how, suffixes=("1", "2")
),
pd.merge(
A, B, left_index=True, right_index=True, how=how, suffixes=("1", "2")
),
)
> await list_eq(
dd.merge(a, b, left_on="x", right_index=True, how=how),
pd.merge(A, B, left_on="x", right_index=True, how=how),
)
distributed\shuffle\tests\test_merge.py:218:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:336: in _result
raise exc.with_traceback(tb)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\dask_expr\_merge.py:775: in assign_index_merge_transfer
index["_index"] = df.index
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4299: in __setitem__
self._set_item(key, value)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4526: in _set_item
self._set_item_mgr(key, value, refs)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4484: in _set_item_mgr
self._check_setitem_copy()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
# pyright: reportPropertyTypeMismatch=false
from __future__ import annotations
import collections
from copy import deepcopy
import datetime as dt
from functools import partial
import gc
from json import loads
import operator
import pickle
import re
import sys
from typing import (
TYPE_CHECKING,
Any,
Callable,
ClassVar,
Literal,
NoReturn,
cast,
final,
overload,
)
import warnings
import weakref
import numpy as np
from pandas._config import (
config,
using_copy_on_write,
warn_copy_on_write,
)
from pandas._libs import lib
from pandas._libs.lib import is_range_indexer
from pandas._libs.tslibs import (
Period,
Tick,
Timestamp,
to_offset,
)
from pandas._libs.tslibs.dtypes import freq_to_period_freqstr
from pandas._typing import (
AlignJoin,
AnyArrayLike,
ArrayLike,
Axes,
Axis,
AxisInt,
CompressionOptions,
DtypeArg,
DtypeBackend,
DtypeObj,
FilePath,
FillnaOptions,
FloatFormatType,
FormattersType,
Frequency,
IgnoreRaise,
IndexKeyFunc,
IndexLabel,
InterpolateOptions,
IntervalClosedType,
JSONSerializable,
Level,
Manager,
NaPosition,
NDFrameT,
OpenFileErrors,
RandomState,
ReindexMethod,
Renamer,
Scalar,
Self,
SequenceNotStr,
SortKind,
StorageOptions,
Suffixes,
T,
TimeAmbiguous,
TimedeltaConvertibleTypes,
TimeNonexistent,
TimestampConvertibleTypes,
TimeUnit,
ValueKeyFunc,
WriteBuffer,
WriteExcelBuffer,
npt,
)
from pandas.compat import PYPY
from pandas.compat._constants import REF_COUNT
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.errors import (
AbstractMethodError,
ChainedAssignmentError,
InvalidIndexError,
SettingWithCopyError,
SettingWithCopyWarning,
_chained_assignment_method_msg,
_chained_assignment_warning_method_msg,
_check_cacher,
)
from pandas.util._decorators import (
deprecate_nonkeyword_arguments,
doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
check_dtype_backend,
validate_ascending,
validate_bool_kwarg,
validate_fillna_kwargs,
validate_inclusive,
)
from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.common import (
ensure_object,
ensure_platform_int,
ensure_str,
is_bool,
is_bool_dtype,
is_dict_like,
is_extension_array_dtype,
is_list_like,
is_number,
is_numeric_dtype,
is_re_compilable,
is_scalar,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
ExtensionDtype,
)
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCSeries,
)
from pandas.core.dtypes.inference import (
is_hashable,
is_nested_list_like,
)
from pandas.core.dtypes.missing import (
isna,
notna,
)
from pandas.core import (
algorithms as algos,
arraylike,
common,
indexing,
missing,
nanops,
sample,
)
from pandas.core.array_algos.replace import should_use_regex
from pandas.core.arrays import ExtensionArray
from pandas.core.base import PandasObject
from pandas.core.construction import extract_array
from pandas.core.flags import Flags
from pandas.core.indexes.api import (
DatetimeIndex,
Index,
MultiIndex,
PeriodIndex,
RangeIndex,
default_index,
ensure_index,
)
from pandas.core.internals import (
ArrayManager,
BlockManager,
SingleArrayManager,
)
from pandas.core.internals.construction import (
mgr_to_mgr,
ndarray_to_mgr,
)
from pandas.core.methods.describe import describe_ndframe
from pandas.core.missing import (
clean_fill_method,
clean_reindex_fill_method,
find_valid_index,
)
from pandas.core.reshape.concat import concat
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import get_indexer_indexer
from pandas.core.window import (
Expanding,
ExponentialMovingWindow,
Rolling,
Window,
)
from pandas.io.formats.format import (
DataFrameFormatter,
DataFrameRenderer,
)
from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
from collections.abc import (
Hashable,
Iterator,
Mapping,
Sequence,
)
from pandas._libs.tslibs import BaseOffset
from pandas import (
DataFrame,
ExcelWriter,
HDFStore,
Series,
)
from pandas.core.indexers.objects import BaseIndexer
from pandas.core.resample import Resampler
# goal is to be able to define the docs close to function, while still being
# able to share
_shared_docs = {**_shared_docs}
_shared_doc_kwargs = {
"axes": "keywords for axes",
"klass": "Series/DataFrame",
"axes_single_arg": "{0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame", # noqa: E501
"inplace": """
inplace : bool, default False
If True, performs operation inplace and returns None.""",
"optional_by": """
by : str or list of str
Name or list of names to sort by""",
}
bool_t = bool # Need alias because NDFrame has def bool:
class NDFrame(PandasObject, indexing.IndexingMixin):
"""
N-dimensional analogue of DataFrame. Store multi-dimensional in a
size-mutable, labeled data structure
Parameters
----------
data : BlockManager
axes : list
copy : bool, default False
"""
_internal_names: list[str] = [
"_mgr",
"_cacher",
"_item_cache",
"_cache",
"_is_copy",
"_name",
"_metadata",
"_flags",
]
_internal_names_set: set[str] = set(_internal_names)
_accessors: set[str] = set()
_hidden_attrs: frozenset[str] = frozenset([])
_metadata: list[str] = []
_is_copy: weakref.ReferenceType[NDFrame] | str | None = None
_mgr: Manager
_attrs: dict[Hashable, Any]
_typ: str
# ----------------------------------------------------------------------
# Constructors
def __init__(self, data: Manager) -> None:
object.__setattr__(self, "_is_copy", None)
object.__setattr__(self, "_mgr", data)
object.__setattr__(self, "_item_cache", {})
object.__setattr__(self, "_attrs", {})
object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))
@final
@classmethod
def _init_mgr(
cls,
mgr: Manager,
axes: dict[Literal["index", "columns"], Axes | None],
dtype: DtypeObj | None = None,
copy: bool_t = False,
) -> Manager:
"""passed a manager and a axes dict"""
for a, axe in axes.items():
if axe is not None:
axe = ensure_index(axe)
bm_axis = cls._get_block_manager_axis(a)
mgr = mgr.reindex_axis(axe, axis=bm_axis)
# make a copy if explicitly requested
if copy:
mgr = mgr.copy()
if dtype is not None:
# avoid further copies if we can
if (
isinstance(mgr, BlockManager)
and len(mgr.blocks) == 1
and mgr.blocks[0].values.dtype == dtype
):
pass
else:
mgr = mgr.astype(dtype=dtype)
return mgr
@final
def _as_manager(self, typ: str, copy: bool_t = True) -> Self:
"""
Private helper function to create a DataFrame with specific manager.
Parameters
----------
typ : {"block", "array"}
copy : bool, default True
Only controls whether the conversion from Block->ArrayManager
copies the 1D arrays (to ensure proper/contiguous memory layout).
Returns
-------
DataFrame
New DataFrame using specified manager type. Is not guaranteed
to be a copy or not.
"""
new_mgr: Manager
new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
# fastpath of passing a manager doesn't check the option/manager class
return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self)
@classmethod
def _from_mgr(cls, mgr: Manager, axes: list[Index]) -> Self:
"""
Construct a new object of this type from a Manager object and axes.
Parameters
----------
mgr : Manager
Must have the same ndim as cls.
axes : list[Index]
Notes
-----
The axes must match mgr.axes, but are required for future-proofing
in the event that axes are refactored out of the Manager objects.
"""
obj = cls.__new__(cls)
NDFrame.__init__(obj, mgr)
return obj
# ----------------------------------------------------------------------
# attrs and flags
@property
def attrs(self) -> dict[Hashable, Any]:
"""
Dictionary of global attributes of this dataset.
.. warning::
attrs is experimental and may change without warning.
See Also
--------
DataFrame.flags : Global flags applying to this object.
Notes
-----
Many operations that create new datasets will copy ``attrs``. Copies
are always deep so that changing ``attrs`` will only affect the
present dataset. ``pandas.concat`` copies ``attrs`` only if all input
datasets have the same ``attrs``.
Examples
--------
For Series:
>>> ser = pd.Series([1, 2, 3])
>>> ser.attrs = {"A": [10, 20, 30]}
>>> ser.attrs
{'A': [10, 20, 30]}
For DataFrame:
>>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
>>> df.attrs = {"A": [10, 20, 30]}
>>> df.attrs
{'A': [10, 20, 30]}
"""
return self._attrs
@attrs.setter
def attrs(self, value: Mapping[Hashable, Any]) -> None:
self._attrs = dict(value)
@final
@property
def flags(self) -> Flags:
"""
Get the properties associated with this pandas object.
The available flags are
* :attr:`Flags.allows_duplicate_labels`
See Also
--------
Flags : Flags that apply to pandas objects.
DataFrame.attrs : Global metadata applying to this dataset.
Notes
-----
"Flags" differ from "metadata". Flags reflect properties of the
pandas object (the Series or DataFrame). Metadata refer to properties
of the dataset, and should be stored in :attr:`DataFrame.attrs`.
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2]})
>>> df.flags
<Flags(allows_duplicate_labels=True)>
Flags can be get or set using ``.``
>>> df.flags.allows_duplicate_labels
True
>>> df.flags.allows_duplicate_labels = False
Or by slicing with a key
>>> df.flags["allows_duplicate_labels"]
False
>>> df.flags["allows_duplicate_labels"] = True
"""
return self._flags
@final
def set_flags(
self,
*,
copy: bool_t = False,
allows_duplicate_labels: bool_t | None = None,
) -> Self:
"""
Return a new object with updated flags.
Parameters
----------
copy : bool, default False
Specify if a copy of the object should be made.
.. note::
The `copy` keyword will change behavior in pandas 3.0.
`Copy-on-Write
<https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
will be enabled by default, which means that all methods with a
`copy` keyword will use a lazy copy mechanism to defer the copy and
ignore the `copy` keyword. The `copy` keyword will be removed in a
future version of pandas.
You can already get the future behavior and improvements through
enabling copy on write ``pd.options.mode.copy_on_write = True``
allows_duplicate_labels : bool, optional
Whether the returned object allows duplicate labels.
Returns
-------
Series or DataFrame
The same type as the caller.
See Also
--------
DataFrame.attrs : Global metadata applying to this dataset.
DataFrame.flags : Global flags applying to this object.
Notes
-----
This method returns a new object that's a view on the same data
as the input. Mutating the input or the output values will be reflected
in the other.
This method is intended to be used in method chains.
"Flags" differ from "metadata". Flags reflect properties of the
pandas object (the Series or DataFrame). Metadata refer to properties
of the dataset, and should be stored in :attr:`DataFrame.attrs`.
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2]})
>>> df.flags.allows_duplicate_labels
True
>>> df2 = df.set_flags(allows_duplicate_labels=False)
>>> df2.flags.allows_duplicate_labels
False
"""
df = self.copy(deep=copy and not using_copy_on_write())
if allows_duplicate_labels is not None:
df.flags["allows_duplicate_labels"] = allows_duplicate_labels
return df
@final
@classmethod
def _validate_dtype(cls, dtype) -> DtypeObj | None:
"""validate the passed dtype"""
if dtype is not None:
dtype = pandas_dtype(dtype)
# a compound dtype
if dtype.kind == "V":
raise NotImplementedError(
"compound dtypes are not implemented "
f"in the {cls.__name__} constructor"
)
return dtype
# ----------------------------------------------------------------------
# Construction
@property
def _constructor(self) -> Callable[..., Self]:
"""
Used when a manipulation result has the same dimensions as the
original.
"""
raise AbstractMethodError(self)
# ----------------------------------------------------------------------
# Internals
@final
@property
def _data(self):
# GH#33054 retained because some downstream packages uses this,
# e.g. fastparquet
# GH#33333
warnings.warn(
f"{type(self).__name__}._data is deprecated and will be removed in "
"a future version. Use public APIs instead.",
DeprecationWarning,
stacklevel=find_stack_level(),
)
return self._mgr
# ----------------------------------------------------------------------
# Axis
_AXIS_ORDERS: list[Literal["index", "columns"]]
_AXIS_TO_AXIS_NUMBER: dict[Axis, AxisInt] = {0: 0, "index": 0, "rows": 0}
_info_axis_number: int
_info_axis_name: Literal["index", "columns"]
_AXIS_LEN: int
@final
def _construct_axes_dict(self, axes: Sequence[Axis] | None = None, **kwargs):
"""Return an axes dictionary for myself."""
d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
# error: Argument 1 to "update" of "MutableMapping" has incompatible type
# "Dict[str, Any]"; expected "SupportsKeysAndGetItem[Union[int, str], Any]"
d.update(kwargs) # type: ignore[arg-type]
return d
@final
@classmethod
def _get_axis_number(cls, axis: Axis) -> AxisInt:
try:
return cls._AXIS_TO_AXIS_NUMBER[axis]
except KeyError:
raise ValueError(f"No axis named {axis} for object type {cls.__name__}")
@final
@classmethod
def _get_axis_name(cls, axis: Axis) -> Literal["index", "columns"]:
axis_number = cls._get_axis_number(axis)
return cls._AXIS_ORDERS[axis_number]
@final
def _get_axis(self, axis: Axis) -> Index:
axis_number = self._get_axis_number(axis)
assert axis_number in {0, 1}
return self.index if axis_number == 0 else self.columns
@final
@classmethod
def _get_block_manager_axis(cls, axis: Axis) -> AxisInt:
"""Map the axis to the block_manager axis."""
axis = cls._get_axis_number(axis)
ndim = cls._AXIS_LEN
if ndim == 2:
# i.e. DataFrame
return 1 - axis
return axis
@final
def _get_axis_resolvers(self, axis: str) -> dict[str, Series | MultiIndex]:
# index or columns
axis_index = getattr(self, axis)
d = {}
prefix = axis[0]
for i, name in enumerate(axis_index.names):
if name is not None:
key = level = name
else:
# prefix with 'i' or 'c' depending on the input axis
# e.g., you must do ilevel_0 for the 0th level of an unnamed
# multiiindex
key = f"{prefix}level_{i}"
level = i
level_values = axis_index.get_level_values(level)
s = level_values.to_series()
s.index = axis_index
d[key] = s
# put the index/columns itself in the dict
if isinstance(axis_index, MultiIndex):
dindex = axis_index
else:
dindex = axis_index.to_series()
d[axis] = dindex
return d
@final
def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]:
from pandas.core.computation.parsing import clean_column_name
d: dict[str, Series | MultiIndex] = {}
for axis_name in self._AXIS_ORDERS:
d.update(self._get_axis_resolvers(axis_name))
return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)}
@final
def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:
"""
Return the special character free column resolvers of a dataframe.
Column names with special characters are 'cleaned up' so that they can
be referred to by backtick quoting.
Used in :meth:`DataFrame.eval`.
"""
from pandas.core.computation.parsing import clean_column_name
from pandas.core.series import Series
if isinstance(self, ABCSeries):
return {clean_column_name(self.name): self}
return {
clean_column_name(k): Series(
v, copy=False, index=self.index, name=k, dtype=self.dtypes[k]
).__finalize__(self)
for k, v in zip(self.columns, self._iter_column_arrays())
if not isinstance(k, int)
}
@final
@property
def _info_axis(self) -> Index:
return getattr(self, self._info_axis_name)
def _is_view_after_cow_rules(self):
# Only to be used in cases of chained assignment checks, this is a
# simplified check that assumes that either the whole object is a view
# or a copy
if len(self._mgr.blocks) == 0: # type: ignore[union-attr]
return False
return self._mgr.blocks[0].refs.has_reference() # type: ignore[union-attr]
@property
def shape(self) -> tuple[int, ...]:
"""
Return a tuple of axis dimensions
"""
return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)
@property
def axes(self) -> list[Index]:
"""
Return index label(s) of the internal NDFrame
"""
# we do it this way because if we have reversed axes, then
# the block manager shows then reversed
return [self._get_axis(a) for a in self._AXIS_ORDERS]
@final
@property
def ndim(self) -> int:
"""
Return an int representing the number of axes / array dimensions.
Return 1 if Series. Otherwise return 2 if DataFrame.
See Also
--------
ndarray.ndim : Number of array dimensions.
Examples
--------
>>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
>>> s.ndim
1
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.ndim
2
"""
return self._mgr.ndim
@final
@property
def size(self) -> int:
"""
Return an int representing the number of elements in this object.
Return the number of rows if Series. Otherwise return the number of
rows times number of columns if DataFrame.
See Also
--------
ndarray.size : Number of elements in the array.
Examples
--------
>>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
>>> s.size
3
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.size
4
"""
return int(np.prod(self.shape))
def set_axis(
self,
labels,
*,
axis: Axis = 0,
copy: bool_t | None = None,
) -> Self:
"""
Assign desired index to given axis.
Indexes for%(extended_summary_sub)s row labels can be changed by assigning
a list-like or Index.
Parameters
----------
labels : list-like, Index
The values for the new index.
axis : %(axes_single_arg)s, default 0
The axis to update. The value 0 identifies the rows. For `Series`
this parameter is unused and defaults to 0.
copy : bool, default True
Whether to make a copy of the underlying data.
.. note::
The `copy` keyword will change behavior in pandas 3.0.
`Copy-on-Write
<https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
will be enabled by default, which means that all methods with a
`copy` keyword will use a lazy copy mechanism to defer the copy and
ignore the `copy` keyword. The `copy` keyword will be removed in a
future version of pandas.
You can already get the future behavior and improvements through
enabling copy on write ``pd.options.mode.copy_on_write = True``
Returns
-------
%(klass)s
An object of type %(klass)s.
See Also
--------
%(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s.
"""
return self._set_axis_nocheck(labels, axis, inplace=False, copy=copy)
@final
def _set_axis_nocheck(
self, labels, axis: Axis, inplace: bool_t, copy: bool_t | None
):
if inplace:
setattr(self, self._get_axis_name(axis), labels)
else:
# With copy=False, we create a new object but don't copy the
# underlying data.
obj = self.copy(deep=copy and not using_copy_on_write())
setattr(obj, obj._get_axis_name(axis), labels)
return obj
@final
def _set_axis(self, axis: AxisInt, labels: AnyArrayLike | list) -> None:
"""
This is called from the cython code when we set the `index` attribute
directly, e.g. `series.index = [1, 2, 3]`.
"""
labels = ensure_index(labels)
self._mgr.set_axis(axis, labels)
self._clear_item_cache()
@final
def swapaxes(self, axis1: Axis, axis2: Axis, copy: bool_t | None = None) -> Self:
"""
Interchange axes and swap values axes appropriately.
.. deprecated:: 2.1.0
``swapaxes`` is deprecated and will be removed.
Please use ``transpose`` instead.
Returns
-------
same as input
Examples
--------
Please see examples for :meth:`DataFrame.transpose`.
"""
warnings.warn(
# GH#51946
f"'{type(self).__name__}.swapaxes' is deprecated and "
"will be removed in a future version. "
f"Please use '{type(self).__name__}.transpose' instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
i = self._get_axis_number(axis1)
j = self._get_axis_number(axis2)
if i ==… a list
will call the method numerous times.
format : dict, list of dict
Keyword args to pass to the method call of ``Styler.format``. If a list will
call the method numerous times.
format_index : dict, list of dict
Keyword args to pass to the method call of ``Styler.format_index``. If a
list will call the method numerous times.
render_kwargs : dict
Keyword args to pass to the method call of ``Styler.to_latex``.
Returns
-------
str or None
If buf is None, returns the result as a string. Otherwise returns None.
"""
from pandas.io.formats.style import Styler
self = cast("DataFrame", self)
styler = Styler(self, uuid="")
for kw_name in ["hide", "relabel_index", "format", "format_index"]:
kw = vars()[kw_name]
if isinstance(kw, dict):
getattr(styler, kw_name)(**kw)
elif isinstance(kw, list):
for sub_kw in kw:
getattr(styler, kw_name)(**sub_kw)
# bold_rows is not a direct kwarg of Styler.to_latex
render_kwargs = {} if render_kwargs is None else render_kwargs
if render_kwargs.pop("bold_rows"):
styler.map_index(lambda v: "textbf:--rwrap;")
return styler.to_latex(buf=buf, **render_kwargs)
@overload
def to_csv(
self,
path_or_buf: None = ...,
sep: str = ...,
na_rep: str = ...,
float_format: str | Callable | None = ...,
columns: Sequence[Hashable] | None = ...,
header: bool_t | list[str] = ...,
index: bool_t = ...,
index_label: IndexLabel | None = ...,
mode: str = ...,
encoding: str | None = ...,
compression: CompressionOptions = ...,
quoting: int | None = ...,
quotechar: str = ...,
lineterminator: str | None = ...,
chunksize: int | None = ...,
date_format: str | None = ...,
doublequote: bool_t = ...,
escapechar: str | None = ...,
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
) -> str:
...
@overload
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
sep: str = ...,
na_rep: str = ...,
float_format: str | Callable | None = ...,
columns: Sequence[Hashable] | None = ...,
header: bool_t | list[str] = ...,
index: bool_t = ...,
index_label: IndexLabel | None = ...,
mode: str = ...,
encoding: str | None = ...,
compression: CompressionOptions = ...,
quoting: int | None = ...,
quotechar: str = ...,
lineterminator: str | None = ...,
chunksize: int | None = ...,
date_format: str | None = ...,
doublequote: bool_t = ...,
escapechar: str | None = ...,
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
) -> None:
...
@final
@deprecate_nonkeyword_arguments(
version="3.0", allowed_args=["self", "path_or_buf"], name="to_csv"
)
@doc(
storage_options=_shared_docs["storage_options"],
compression_options=_shared_docs["compression_options"] % "path_or_buf",
)
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
sep: str = ",",
na_rep: str = "",
float_format: str | Callable | None = None,
columns: Sequence[Hashable] | None = None,
header: bool_t | list[str] = True,
index: bool_t = True,
index_label: IndexLabel | None = None,
mode: str = "w",
encoding: str | None = None,
compression: CompressionOptions = "infer",
quoting: int | None = None,
quotechar: str = '"',
lineterminator: str | None = None,
chunksize: int | None = None,
date_format: str | None = None,
doublequote: bool_t = True,
escapechar: str | None = None,
decimal: str = ".",
errors: OpenFileErrors = "strict",
storage_options: StorageOptions | None = None,
) -> str | None:
r"""
Write object to a comma-separated values (csv) file.
Parameters
----------
path_or_buf : str, path object, file-like object, or None, default None
String, path object (implementing os.PathLike[str]), or file-like
object implementing a write() function. If None, the result is
returned as a string. If a non-binary file object is passed, it should
be opened with `newline=''`, disabling universal newlines. If a binary
file object is passed, `mode` might need to contain a `'b'`.
sep : str, default ','
String of length 1. Field delimiter for the output file.
na_rep : str, default ''
Missing data representation.
float_format : str, Callable, default None
Format string for floating point numbers. If a Callable is given, it takes
precedence over other numeric formatting parameters, like decimal.
columns : sequence, optional
Columns to write.
header : bool or list of str, default True
Write out the column names. If a list of strings is given it is
assumed to be aliases for the column names.
index : bool, default True
Write row names (index).
index_label : str or sequence, or False, default None
Column label for index column(s) if desired. If None is given, and
`header` and `index` are True, then the index names are used. A
sequence should be given if the object uses MultiIndex. If
False do not print fields for index names. Use index_label=False
for easier importing in R.
mode : {{'w', 'x', 'a'}}, default 'w'
Forwarded to either `open(mode=)` or `fsspec.open(mode=)` to control
the file opening. Typical values include:
- 'w', truncate the file first.
- 'x', exclusive creation, failing if the file already exists.
- 'a', append to the end of file if it exists.
encoding : str, optional
A string representing the encoding to use in the output file,
defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
is a non-binary file object.
{compression_options}
May be a dict with key 'method' as compression mode
and other entries as additional compression options if
compression mode is 'zip'.
Passing compression options as keys in dict is
supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.
quoting : optional constant from csv module
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
will treat them as non-numeric.
quotechar : str, default '\"'
String of length 1. Character used to quote fields.
lineterminator : str, optional
The newline character or character sequence to use in the output
file. Defaults to `os.linesep`, which depends on the OS in which
this method is called ('\\n' for linux, '\\r\\n' for Windows, i.e.).
.. versionchanged:: 1.5.0
Previously was line_terminator, changed for consistency with
read_csv and the standard library 'csv' module.
chunksize : int or None
Rows to write at a time.
date_format : str, default None
Format string for datetime objects.
doublequote : bool, default True
Control quoting of `quotechar` inside a field.
escapechar : str, default None
String of length 1. Character used to escape `sep` and `quotechar`
when appropriate.
decimal : str, default '.'
Character recognized as decimal separator. E.g. use ',' for
European data.
errors : str, default 'strict'
Specifies how encoding and decoding errors are to be handled.
See the errors argument for :func:`open` for a full list
of options.
{storage_options}
Returns
-------
None or str
If path_or_buf is None, returns the resulting csv format as a
string. Otherwise returns None.
See Also
--------
read_csv : Load a CSV file into a DataFrame.
to_excel : Write DataFrame to an Excel file.
Examples
--------
Create 'out.csv' containing 'df' without indices
>>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'],
... 'mask': ['red', 'purple'],
... 'weapon': ['sai', 'bo staff']}})
>>> df.to_csv('out.csv', index=False) # doctest: +SKIP
Create 'out.zip' containing 'out.csv'
>>> df.to_csv(index=False)
'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
>>> compression_opts = dict(method='zip',
... archive_name='out.csv') # doctest: +SKIP
>>> df.to_csv('out.zip', index=False,
... compression=compression_opts) # doctest: +SKIP
To write a csv file to a new folder or nested folder you will first
need to create it using either Pathlib or os:
>>> from pathlib import Path # doctest: +SKIP
>>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP
>>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP
>>> df.to_csv(filepath) # doctest: +SKIP
>>> import os # doctest: +SKIP
>>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP
>>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP
"""
df = self if isinstance(self, ABCDataFrame) else self.to_frame()
formatter = DataFrameFormatter(
frame=df,
header=header,
index=index,
na_rep=na_rep,
float_format=float_format,
decimal=decimal,
)
return DataFrameRenderer(formatter).to_csv(
path_or_buf,
lineterminator=lineterminator,
sep=sep,
encoding=encoding,
errors=errors,
compression=compression,
quoting=quoting,
columns=columns,
index_label=index_label,
mode=mode,
chunksize=chunksize,
quotechar=quotechar,
date_format=date_format,
doublequote=doublequote,
escapechar=escapechar,
storage_options=storage_options,
)
# ----------------------------------------------------------------------
# Lookup Caching
def _reset_cacher(self) -> None:
"""
Reset the cacher.
"""
raise AbstractMethodError(self)
def _maybe_update_cacher(
self,
clear: bool_t = False,
verify_is_copy: bool_t = True,
inplace: bool_t = False,
) -> None:
"""
See if we need to update our parent cacher if clear, then clear our
cache.
Parameters
----------
clear : bool, default False
Clear the item cache.
verify_is_copy : bool, default True
Provide is_copy checks.
"""
if using_copy_on_write():
return
if verify_is_copy:
self._check_setitem_copy(t="referent")
if clear:
self._clear_item_cache()
def _clear_item_cache(self) -> None:
raise AbstractMethodError(self)
# ----------------------------------------------------------------------
# Indexing Methods
@final
def take(self, indices, axis: Axis = 0, **kwargs) -> Self:
"""
Return the elements in the given *positional* indices along an axis.
This means that we are not indexing according to actual values in
the index attribute of the object. We are indexing according to the
actual position of the element in the object.
Parameters
----------
indices : array-like
An array of ints indicating which positions to take.
axis : {0 or 'index', 1 or 'columns', None}, default 0
The axis on which to select elements. ``0`` means that we are
selecting rows, ``1`` means that we are selecting columns.
For `Series` this parameter is unused and defaults to 0.
**kwargs
For compatibility with :meth:`numpy.take`. Has no effect on the
output.
Returns
-------
same type as caller
An array-like containing the elements taken from the object.
See Also
--------
DataFrame.loc : Select a subset of a DataFrame by labels.
DataFrame.iloc : Select a subset of a DataFrame by positions.
numpy.take : Take elements from an array along an axis.
Examples
--------
>>> df = pd.DataFrame([('falcon', 'bird', 389.0),
... ('parrot', 'bird', 24.0),
... ('lion', 'mammal', 80.5),
... ('monkey', 'mammal', np.nan)],
... columns=['name', 'class', 'max_speed'],
... index=[0, 2, 3, 1])
>>> df
name class max_speed
0 falcon bird 389.0
2 parrot bird 24.0
3 lion mammal 80.5
1 monkey mammal NaN
Take elements at positions 0 and 3 along the axis 0 (default).
Note how the actual indices selected (0 and 1) do not correspond to
our selected indices 0 and 3. That's because we are selecting the 0th
and 3rd rows, not rows whose indices equal 0 and 3.
>>> df.take([0, 3])
name class max_speed
0 falcon bird 389.0
1 monkey mammal NaN
Take elements at indices 1 and 2 along the axis 1 (column selection).
>>> df.take([1, 2], axis=1)
class max_speed
0 bird 389.0
2 bird 24.0
3 mammal 80.5
1 mammal NaN
We may take elements using negative integers for positive indices,
starting from the end of the object, just like with Python lists.
>>> df.take([-1, -2])
name class max_speed
1 monkey mammal NaN
3 lion mammal 80.5
"""
nv.validate_take((), kwargs)
if not isinstance(indices, slice):
indices = np.asarray(indices, dtype=np.intp)
if (
axis == 0
and indices.ndim == 1
and using_copy_on_write()
and is_range_indexer(indices, len(self))
):
return self.copy(deep=None)
elif self.ndim == 1:
raise TypeError(
f"{type(self).__name__}.take requires a sequence of integers, "
"not slice."
)
else:
warnings.warn(
# GH#51539
f"Passing a slice to {type(self).__name__}.take is deprecated "
"and will raise in a future version. Use `obj[slicer]` or pass "
"a sequence of integers instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
# We can get here with a slice via DataFrame.__getitem__
indices = np.arange(
indices.start, indices.stop, indices.step, dtype=np.intp
)
new_data = self._mgr.take(
indices,
axis=self._get_block_manager_axis(axis),
verify=True,
)
return self._constructor_from_mgr(new_data, axes=new_data.axes).__finalize__(
self, method="take"
)
@final
def _take_with_is_copy(self, indices, axis: Axis = 0) -> Self:
"""
Internal version of the `take` method that sets the `_is_copy`
attribute to keep track of the parent dataframe (using in indexing
for the SettingWithCopyWarning).
For Series this does the same as the public take (it never sets `_is_copy`).
See the docstring of `take` for full explanation of the parameters.
"""
result = self.take(indices=indices, axis=axis)
# Maybe set copy if we didn't actually change the index.
if self.ndim == 2 and not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
return result
@final
def xs(
self,
key: IndexLabel,
axis: Axis = 0,
level: IndexLabel | None = None,
drop_level: bool_t = True,
) -> Self:
"""
Return cross-section from the Series/DataFrame.
This method takes a `key` argument to select data at a particular
level of a MultiIndex.
Parameters
----------
key : label or tuple of label
Label contained in the index, or partially in a MultiIndex.
axis : {0 or 'index', 1 or 'columns'}, default 0
Axis to retrieve cross-section on.
level : object, defaults to first n levels (n=1 or len(key))
In case of a key partially contained in a MultiIndex, indicate
which levels are used. Levels can be referred by label or position.
drop_level : bool, default True
If False, returns object with same levels as self.
Returns
-------
Series or DataFrame
Cross-section from the original Series or DataFrame
corresponding to the selected index levels.
See Also
--------
DataFrame.loc : Access a group of rows and columns
by label(s) or a boolean array.
DataFrame.iloc : Purely integer-location based indexing
for selection by position.
Notes
-----
`xs` can not be used to set values.
MultiIndex Slicers is a generic way to get/set values on
any level or levels.
It is a superset of `xs` functionality, see
:ref:`MultiIndex Slicers <advanced.mi_slicers>`.
Examples
--------
>>> d = {'num_legs': [4, 4, 2, 2],
... 'num_wings': [0, 0, 2, 2],
... 'class': ['mammal', 'mammal', 'mammal', 'bird'],
... 'animal': ['cat', 'dog', 'bat', 'penguin'],
... 'locomotion': ['walks', 'walks', 'flies', 'walks']}
>>> df = pd.DataFrame(data=d)
>>> df = df.set_index(['class', 'animal', 'locomotion'])
>>> df
num_legs num_wings
class animal locomotion
mammal cat walks 4 0
dog walks 4 0
bat flies 2 2
bird penguin walks 2 2
Get values at specified index
>>> df.xs('mammal')
num_legs num_wings
animal locomotion
cat walks 4 0
dog walks 4 0
bat flies 2 2
Get values at several indexes
>>> df.xs(('mammal', 'dog', 'walks'))
num_legs 4
num_wings 0
Name: (mammal, dog, walks), dtype: int64
Get values at specified index and level
>>> df.xs('cat', level=1)
num_legs num_wings
class locomotion
mammal walks 4 0
Get values at several indexes and levels
>>> df.xs(('bird', 'walks'),
... level=[0, 'locomotion'])
num_legs num_wings
animal
penguin 2 2
Get values at specified column and axis
>>> df.xs('num_wings', axis=1)
class animal locomotion
mammal cat walks 0
dog walks 0
bat flies 2
bird penguin walks 2
Name: num_wings, dtype: int64
"""
axis = self._get_axis_number(axis)
labels = self._get_axis(axis)
if isinstance(key, list):
raise TypeError("list keys are not supported in xs, pass a tuple instead")
if level is not None:
if not isinstance(labels, MultiIndex):
raise TypeError("Index must be a MultiIndex")
loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)
# create the tuple of the indexer
_indexer = [slice(None)] * self.ndim
_indexer[axis] = loc
indexer = tuple(_indexer)
result = self.iloc[indexer]
setattr(result, result._get_axis_name(axis), new_ax)
return result
if axis == 1:
if drop_level:
return self[key]
index = self.columns
else:
index = self.index
if isinstance(index, MultiIndex):
loc, new_index = index._get_loc_level(key, level=0)
if not drop_level:
if lib.is_integer(loc):
# Slice index must be an integer or None
new_index = index[loc : loc + 1]
else:
new_index = index[loc]
else:
loc = index.get_loc(key)
if isinstance(loc, np.ndarray):
if loc.dtype == np.bool_:
(inds,) = loc.nonzero()
return self._take_with_is_copy(inds, axis=axis)
else:
return self._take_with_is_copy(loc, axis=axis)
if not is_scalar(loc):
new_index = index[loc]
if is_scalar(loc) and axis == 0:
# In this case loc should be an integer
if self.ndim == 1:
# if we encounter an array-like and we only have 1 dim
# that means that their are list/ndarrays inside the Series!
# so just return them (GH 6394)
return self._values[loc]
new_mgr = self._mgr.fast_xs(loc)
result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes)
result._name = self.index[loc]
result = result.__finalize__(self)
elif is_scalar(loc):
result = self.iloc[:, slice(loc, loc + 1)]
elif axis == 1:
result = self.iloc[:, loc]
else:
result = self.iloc[loc]
result.index = new_index
# this could be a view
# but only in a single-dtyped view sliceable case
result._set_is_copy(self, copy=not result._is_view)
return result
def __getitem__(self, item):
raise AbstractMethodError(self)
@final
def _getitem_slice(self, key: slice) -> Self:
"""
__getitem__ for the case where the key is a slice object.
"""
# _convert_slice_indexer to determine if this slice is positional
# or label based, and if the latter, convert to positional
slobj = self.index._convert_slice_indexer(key, kind="getitem")
if isinstance(slobj, np.ndarray):
# reachable with DatetimeIndex
indexer = lib.maybe_indices_to_slice(
slobj.astype(np.intp, copy=False), len(self)
)
if isinstance(indexer, np.ndarray):
# GH#43223 If we can not convert, use take
return self.take(indexer, axis=0)
slobj = indexer
return self._slice(slobj)
def _slice(self, slobj: slice, axis: AxisInt = 0) -> Self:
"""
Construct a slice of this container.
Slicing with this method is *always* positional.
"""
assert isinstance(slobj, slice), type(slobj)
axis = self._get_block_manager_axis(axis)
new_mgr = self._mgr.get_slice(slobj, axis=axis)
result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
result = result.__finalize__(self)
# this could be a view
# but only in a single-dtyped view sliceable case
is_copy = axis != 0 or result._is_view
result._set_is_copy(self, copy=is_copy)
return result
@final
def _set_is_copy(self, ref: NDFrame, copy: bool_t = True) -> None:
if not copy:
self._is_copy = None
else:
assert ref is not None
self._is_copy = weakref.ref(ref)
def _check_is_chained_assignment_possible(self) -> bool_t:
"""
Check if we are a view, have a cacher, and are of mixed type.
If so, then force a setitem_copy check.
Should be called just near setting a value
Will return a boolean if it we are a view and are cached, but a
single-dtype meaning that the cacher should be updated following
setting.
"""
if self._is_copy:
self._check_setitem_copy(t="referent")
return False
@final
def _check_setitem_copy(self, t: str = "setting", force: bool_t = False):
"""
Parameters
----------
t : str, the type of setting error
force : bool, default False
If True, then force showing an error.
validate if we are doing a setitem on a chained copy.
It is technically possible to figure out that we are setting on
a copy even WITH a multi-dtyped pandas object. In other words, some
blocks may be views while other are not. Currently _is_view will ALWAYS
return False for multi-blocks to avoid having to handle this case.
df = DataFrame(np.arange(0,9), columns=['count'])
df['group'] = 'b'
# This technically need not raise SettingWithCopy if both are view
# (which is not generally guaranteed but is usually True. However,
# this is in general not a good practice and we recommend using .loc.
df.iloc[0:5]['group'] = 'a'
"""
if using_copy_on_write() or warn_copy_on_write():
return
# return early if the check is not needed
if not (force or self._is_copy):
return
value = config.get_option("mode.chained_assignment")
if value is None:
return
# see if the copy is not actually referred; if so, then dissolve
# the copy weakref
if self._is_copy is not None and not isinstance(self._is_copy, str):
r = self._is_copy()
if not gc.get_referents(r) or (r is not None and r.shape == self.shape):
self._is_copy = None
return
# a custom message
if isinstance(self._is_copy, str):
t = self._is_copy
elif t == "referent":
t = (
"\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame\n\n"
"See the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy"
)
else:
t = (
"\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame.\n"
"Try using .loc[row_indexer,col_indexer] = value "
"instead\n\nSee the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy"
)
if value == "raise":
raise SettingWithCopyError(t)
if value == "warn":
> warnings.warn(t, SettingWithCopyWarning, stacklevel=find_stack_level())
E pandas.errors.SettingWithCopyWarning:
E A value is trying to be set on a copy of a slice from a DataFrame.
E Try using .loc[row_indexer,col_indexer] = value instead
E
E See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\generic.py:4472: SettingWithCopyWarning
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
10 out of 12 runs failed: test_merge[False-left] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 9s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 4s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 4s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 6s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 6s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 6s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 7s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 8s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 6s]
Raw output
pandas.errors.SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
c = <Client: 'tcp://127.0.0.1:56750' processes=2 threads=3, memory=32.00 GiB>
s = <Scheduler 'tcp://127.0.0.1:56750', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'left', disk = False
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
await list_eq(joined, pd.merge(A, B, on="y", how=how))
assert all(d is None for d in joined.divisions)
await list_eq(
dd.merge(a, b, left_on="x", right_on="z", how=how),
pd.merge(A, B, left_on="x", right_on="z", how=how),
)
await list_eq(
dd.merge(a, b, left_on="x", right_on="z", how=how, suffixes=("1", "2")),
pd.merge(A, B, left_on="x", right_on="z", how=how, suffixes=("1", "2")),
)
await list_eq(dd.merge(a, b, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(a, B, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(A, b, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(A, B, how=how), pd.merge(A, B, how=how))
await list_eq(
dd.merge(a, b, left_index=True, right_index=True, how=how),
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
await list_eq(
dd.merge(
a, b, left_index=True, right_index=True, how=how, suffixes=("1", "2")
),
pd.merge(
A, B, left_index=True, right_index=True, how=how, suffixes=("1", "2")
),
)
> await list_eq(
dd.merge(a, b, left_on="x", right_index=True, how=how),
pd.merge(A, B, left_on="x", right_index=True, how=how),
)
distributed\shuffle\tests\test_merge.py:218:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:336: in _result
raise exc.with_traceback(tb)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\dask_expr\_merge.py:775: in assign_index_merge_transfer
index["_index"] = df.index
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4299: in __setitem__
self._set_item(key, value)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4526: in _set_item
self._set_item_mgr(key, value, refs)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4484: in _set_item_mgr
self._check_setitem_copy()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
# pyright: reportPropertyTypeMismatch=false
from __future__ import annotations
import collections
from copy import deepcopy
import datetime as dt
from functools import partial
import gc
from json import loads
import operator
import pickle
import re
import sys
from typing import (
TYPE_CHECKING,
Any,
Callable,
ClassVar,
Literal,
NoReturn,
cast,
final,
overload,
)
import warnings
import weakref
import numpy as np
from pandas._config import (
config,
using_copy_on_write,
warn_copy_on_write,
)
from pandas._libs import lib
from pandas._libs.lib import is_range_indexer
from pandas._libs.tslibs import (
Period,
Tick,
Timestamp,
to_offset,
)
from pandas._libs.tslibs.dtypes import freq_to_period_freqstr
from pandas._typing import (
AlignJoin,
AnyArrayLike,
ArrayLike,
Axes,
Axis,
AxisInt,
CompressionOptions,
DtypeArg,
DtypeBackend,
DtypeObj,
FilePath,
FillnaOptions,
FloatFormatType,
FormattersType,
Frequency,
IgnoreRaise,
IndexKeyFunc,
IndexLabel,
InterpolateOptions,
IntervalClosedType,
JSONSerializable,
Level,
Manager,
NaPosition,
NDFrameT,
OpenFileErrors,
RandomState,
ReindexMethod,
Renamer,
Scalar,
Self,
SequenceNotStr,
SortKind,
StorageOptions,
Suffixes,
T,
TimeAmbiguous,
TimedeltaConvertibleTypes,
TimeNonexistent,
TimestampConvertibleTypes,
TimeUnit,
ValueKeyFunc,
WriteBuffer,
WriteExcelBuffer,
npt,
)
from pandas.compat import PYPY
from pandas.compat._constants import REF_COUNT
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.errors import (
AbstractMethodError,
ChainedAssignmentError,
InvalidIndexError,
SettingWithCopyError,
SettingWithCopyWarning,
_chained_assignment_method_msg,
_chained_assignment_warning_method_msg,
_check_cacher,
)
from pandas.util._decorators import (
deprecate_nonkeyword_arguments,
doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
check_dtype_backend,
validate_ascending,
validate_bool_kwarg,
validate_fillna_kwargs,
validate_inclusive,
)
from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.common import (
ensure_object,
ensure_platform_int,
ensure_str,
is_bool,
is_bool_dtype,
is_dict_like,
is_extension_array_dtype,
is_list_like,
is_number,
is_numeric_dtype,
is_re_compilable,
is_scalar,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
ExtensionDtype,
)
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCSeries,
)
from pandas.core.dtypes.inference import (
is_hashable,
is_nested_list_like,
)
from pandas.core.dtypes.missing import (
isna,
notna,
)
from pandas.core import (
algorithms as algos,
arraylike,
common,
indexing,
missing,
nanops,
sample,
)
from pandas.core.array_algos.replace import should_use_regex
from pandas.core.arrays import ExtensionArray
from pandas.core.base import PandasObject
from pandas.core.construction import extract_array
from pandas.core.flags import Flags
from pandas.core.indexes.api import (
DatetimeIndex,
Index,
MultiIndex,
PeriodIndex,
RangeIndex,
default_index,
ensure_index,
)
from pandas.core.internals import (
ArrayManager,
BlockManager,
SingleArrayManager,
)
from pandas.core.internals.construction import (
mgr_to_mgr,
ndarray_to_mgr,
)
from pandas.core.methods.describe import describe_ndframe
from pandas.core.missing import (
clean_fill_method,
clean_reindex_fill_method,
find_valid_index,
)
from pandas.core.reshape.concat import concat
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import get_indexer_indexer
from pandas.core.window import (
Expanding,
ExponentialMovingWindow,
Rolling,
Window,
)
from pandas.io.formats.format import (
DataFrameFormatter,
DataFrameRenderer,
)
from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
from collections.abc import (
Hashable,
Iterator,
Mapping,
Sequence,
)
from pandas._libs.tslibs import BaseOffset
from pandas import (
DataFrame,
ExcelWriter,
HDFStore,
Series,
)
from pandas.core.indexers.objects import BaseIndexer
from pandas.core.resample import Resampler
# goal is to be able to define the docs close to function, while still being
# able to share
_shared_docs = {**_shared_docs}
_shared_doc_kwargs = {
"axes": "keywords for axes",
"klass": "Series/DataFrame",
"axes_single_arg": "{0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame", # noqa: E501
"inplace": """
inplace : bool, default False
If True, performs operation inplace and returns None.""",
"optional_by": """
by : str or list of str
Name or list of names to sort by""",
}
bool_t = bool # Need alias because NDFrame has def bool:
class NDFrame(PandasObject, indexing.IndexingMixin):
"""
N-dimensional analogue of DataFrame. Store multi-dimensional in a
size-mutable, labeled data structure
Parameters
----------
data : BlockManager
axes : list
copy : bool, default False
"""
_internal_names: list[str] = [
"_mgr",
"_cacher",
"_item_cache",
"_cache",
"_is_copy",
"_name",
"_metadata",
"_flags",
]
_internal_names_set: set[str] = set(_internal_names)
_accessors: set[str] = set()
_hidden_attrs: frozenset[str] = frozenset([])
_metadata: list[str] = []
_is_copy: weakref.ReferenceType[NDFrame] | str | None = None
_mgr: Manager
_attrs: dict[Hashable, Any]
_typ: str
# ----------------------------------------------------------------------
# Constructors
def __init__(self, data: Manager) -> None:
object.__setattr__(self, "_is_copy", None)
object.__setattr__(self, "_mgr", data)
object.__setattr__(self, "_item_cache", {})
object.__setattr__(self, "_attrs", {})
object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))
@final
@classmethod
def _init_mgr(
cls,
mgr: Manager,
axes: dict[Literal["index", "columns"], Axes | None],
dtype: DtypeObj | None = None,
copy: bool_t = False,
) -> Manager:
"""passed a manager and a axes dict"""
for a, axe in axes.items():
if axe is not None:
axe = ensure_index(axe)
bm_axis = cls._get_block_manager_axis(a)
mgr = mgr.reindex_axis(axe, axis=bm_axis)
# make a copy if explicitly requested
if copy:
mgr = mgr.copy()
if dtype is not None:
# avoid further copies if we can
if (
isinstance(mgr, BlockManager)
and len(mgr.blocks) == 1
and mgr.blocks[0].values.dtype == dtype
):
pass
else:
mgr = mgr.astype(dtype=dtype)
return mgr
@final
def _as_manager(self, typ: str, copy: bool_t = True) -> Self:
"""
Private helper function to create a DataFrame with specific manager.
Parameters
----------
typ : {"block", "array"}
copy : bool, default True
Only controls whether the conversion from Block->ArrayManager
copies the 1D arrays (to ensure proper/contiguous memory layout).
Returns
-------
DataFrame
New DataFrame using specified manager type. Is not guaranteed
to be a copy or not.
"""
new_mgr: Manager
new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
# fastpath of passing a manager doesn't check the option/manager class
return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self)
@classmethod
def _from_mgr(cls, mgr: Manager, axes: list[Index]) -> Self:
"""
Construct a new object of this type from a Manager object and axes.
Parameters
----------
mgr : Manager
Must have the same ndim as cls.
axes : list[Index]
Notes
-----
The axes must match mgr.axes, but are required for future-proofing
in the event that axes are refactored out of the Manager objects.
"""
obj = cls.__new__(cls)
NDFrame.__init__(obj, mgr)
return obj
# ----------------------------------------------------------------------
# attrs and flags
@property
def attrs(self) -> dict[Hashable, Any]:
"""
Dictionary of global attributes of this dataset.
.. warning::
attrs is experimental and may change without warning.
See Also
--------
DataFrame.flags : Global flags applying to this object.
Notes
-----
Many operations that create new datasets will copy ``attrs``. Copies
are always deep so that changing ``attrs`` will only affect the
present dataset. ``pandas.concat`` copies ``attrs`` only if all input
datasets have the same ``attrs``.
Examples
--------
For Series:
>>> ser = pd.Series([1, 2, 3])
>>> ser.attrs = {"A": [10, 20, 30]}
>>> ser.attrs
{'A': [10, 20, 30]}
For DataFrame:
>>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
>>> df.attrs = {"A": [10, 20, 30]}
>>> df.attrs
{'A': [10, 20, 30]}
"""
return self._attrs
@attrs.setter
def attrs(self, value: Mapping[Hashable, Any]) -> None:
self._attrs = dict(value)
@final
@property
def flags(self) -> Flags:
"""
Get the properties associated with this pandas object.
The available flags are
* :attr:`Flags.allows_duplicate_labels`
See Also
--------
Flags : Flags that apply to pandas objects.
DataFrame.attrs : Global metadata applying to this dataset.
Notes
-----
"Flags" differ from "metadata". Flags reflect properties of the
pandas object (the Series or DataFrame). Metadata refer to properties
of the dataset, and should be stored in :attr:`DataFrame.attrs`.
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2]})
>>> df.flags
<Flags(allows_duplicate_labels=True)>
Flags can be get or set using ``.``
>>> df.flags.allows_duplicate_labels
True
>>> df.flags.allows_duplicate_labels = False
Or by slicing with a key
>>> df.flags["allows_duplicate_labels"]
False
>>> df.flags["allows_duplicate_labels"] = True
"""
return self._flags
@final
def set_flags(
self,
*,
copy: bool_t = False,
allows_duplicate_labels: bool_t | None = None,
) -> Self:
"""
Return a new object with updated flags.
Parameters
----------
copy : bool, default False
Specify if a copy of the object should be made.
.. note::
The `copy` keyword will change behavior in pandas 3.0.
`Copy-on-Write
<https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
will be enabled by default, which means that all methods with a
`copy` keyword will use a lazy copy mechanism to defer the copy and
ignore the `copy` keyword. The `copy` keyword will be removed in a
future version of pandas.
You can already get the future behavior and improvements through
enabling copy on write ``pd.options.mode.copy_on_write = True``
allows_duplicate_labels : bool, optional
Whether the returned object allows duplicate labels.
Returns
-------
Series or DataFrame
The same type as the caller.
See Also
--------
DataFrame.attrs : Global metadata applying to this dataset.
DataFrame.flags : Global flags applying to this object.
Notes
-----
This method returns a new object that's a view on the same data
as the input. Mutating the input or the output values will be reflected
in the other.
This method is intended to be used in method chains.
"Flags" differ from "metadata". Flags reflect properties of the
pandas object (the Series or DataFrame). Metadata refer to properties
of the dataset, and should be stored in :attr:`DataFrame.attrs`.
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2]})
>>> df.flags.allows_duplicate_labels
True
>>> df2 = df.set_flags(allows_duplicate_labels=False)
>>> df2.flags.allows_duplicate_labels
False
"""
df = self.copy(deep=copy and not using_copy_on_write())
if allows_duplicate_labels is not None:
df.flags["allows_duplicate_labels"] = allows_duplicate_labels
return df
@final
@classmethod
def _validate_dtype(cls, dtype) -> DtypeObj | None:
"""validate the passed dtype"""
if dtype is not None:
dtype = pandas_dtype(dtype)
# a compound dtype
if dtype.kind == "V":
raise NotImplementedError(
"compound dtypes are not implemented "
f"in the {cls.__name__} constructor"
)
return dtype
# ----------------------------------------------------------------------
# Construction
@property
def _constructor(self) -> Callable[..., Self]:
"""
Used when a manipulation result has the same dimensions as the
original.
"""
raise AbstractMethodError(self)
# ----------------------------------------------------------------------
# Internals
@final
@property
def _data(self):
# GH#33054 retained because some downstream packages uses this,
# e.g. fastparquet
# GH#33333
warnings.warn(
f"{type(self).__name__}._data is deprecated and will be removed in "
"a future version. Use public APIs instead.",
DeprecationWarning,
stacklevel=find_stack_level(),
)
return self._mgr
# ----------------------------------------------------------------------
# Axis
_AXIS_ORDERS: list[Literal["index", "columns"]]
_AXIS_TO_AXIS_NUMBER: dict[Axis, AxisInt] = {0: 0, "index": 0, "rows": 0}
_info_axis_number: int
_info_axis_name: Literal["index", "columns"]
_AXIS_LEN: int
@final
def _construct_axes_dict(self, axes: Sequence[Axis] | None = None, **kwargs):
"""Return an axes dictionary for myself."""
d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
# error: Argument 1 to "update" of "MutableMapping" has incompatible type
# "Dict[str, Any]"; expected "SupportsKeysAndGetItem[Union[int, str], Any]"
d.update(kwargs) # type: ignore[arg-type]
return d
@final
@classmethod
def _get_axis_number(cls, axis: Axis) -> AxisInt:
try:
return cls._AXIS_TO_AXIS_NUMBER[axis]
except KeyError:
raise ValueError(f"No axis named {axis} for object type {cls.__name__}")
@final
@classmethod
def _get_axis_name(cls, axis: Axis) -> Literal["index", "columns"]:
axis_number = cls._get_axis_number(axis)
return cls._AXIS_ORDERS[axis_number]
@final
def _get_axis(self, axis: Axis) -> Index:
axis_number = self._get_axis_number(axis)
assert axis_number in {0, 1}
return self.index if axis_number == 0 else self.columns
@final
@classmethod
def _get_block_manager_axis(cls, axis: Axis) -> AxisInt:
"""Map the axis to the block_manager axis."""
axis = cls._get_axis_number(axis)
ndim = cls._AXIS_LEN
if ndim == 2:
# i.e. DataFrame
return 1 - axis
return axis
@final
def _get_axis_resolvers(self, axis: str) -> dict[str, Series | MultiIndex]:
# index or columns
axis_index = getattr(self, axis)
d = {}
prefix = axis[0]
for i, name in enumerate(axis_index.names):
if name is not None:
key = level = name
else:
# prefix with 'i' or 'c' depending on the input axis
# e.g., you must do ilevel_0 for the 0th level of an unnamed
# multiiindex
key = f"{prefix}level_{i}"
level = i
level_values = axis_index.get_level_values(level)
s = level_values.to_series()
s.index = axis_index
d[key] = s
# put the index/columns itself in the dict
if isinstance(axis_index, MultiIndex):
dindex = axis_index
else:
dindex = axis_index.to_series()
d[axis] = dindex
return d
@final
def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]:
from pandas.core.computation.parsing import clean_column_name
d: dict[str, Series | MultiIndex] = {}
for axis_name in self._AXIS_ORDERS:
d.update(self._get_axis_resolvers(axis_name))
return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)}
@final
def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:
"""
Return the special character free column resolvers of a dataframe.
Column names with special characters are 'cleaned up' so that they can
be referred to by backtick quoting.
Used in :meth:`DataFrame.eval`.
"""
from pandas.core.computation.parsing import clean_column_name
from pandas.core.series import Series
if isinstance(self, ABCSeries):
return {clean_column_name(self.name): self}
return {
clean_column_name(k): Series(
v, copy=False, index=self.index, name=k, dtype=self.dtypes[k]
).__finalize__(self)
for k, v in zip(self.columns, self._iter_column_arrays())
if not isinstance(k, int)
}
@final
@property
def _info_axis(self) -> Index:
return getattr(self, self._info_axis_name)
def _is_view_after_cow_rules(self):
# Only to be used in cases of chained assignment checks, this is a
# simplified check that assumes that either the whole object is a view
# or a copy
if len(self._mgr.blocks) == 0: # type: ignore[union-attr]
return False
return self._mgr.blocks[0].refs.has_reference() # type: ignore[union-attr]
@property
def shape(self) -> tuple[int, ...]:
"""
Return a tuple of axis dimensions
"""
return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)
@property
def axes(self) -> list[Index]:
"""
Return index label(s) of the internal NDFrame
"""
# we do it this way because if we have reversed axes, then
# the block manager shows then reversed
return [self._get_axis(a) for a in self._AXIS_ORDERS]
@final
@property
def ndim(self) -> int:
"""
Return an int representing the number of axes / array dimensions.
Return 1 if Series. Otherwise return 2 if DataFrame.
See Also
--------
ndarray.ndim : Number of array dimensions.
Examples
--------
>>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
>>> s.ndim
1
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.ndim
2
"""
return self._mgr.ndim
@final
@property
def size(self) -> int:
"""
Return an int representing the number of elements in this object.
Return the number of rows if Series. Otherwise return the number of
rows times number of columns if DataFrame.
See Also
--------
ndarray.size : Number of elements in the array.
Examples
--------
>>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
>>> s.size
3
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.size
4
"""
return int(np.prod(self.shape))
def set_axis(
self,
labels,
*,
axis: Axis = 0,
copy: bool_t | None = None,
) -> Self:
"""
Assign desired index to given axis.
Indexes for%(extended_summary_sub)s row labels can be changed by assigning
a list-like or Index.
Parameters
----------
labels : list-like, Index
The values for the new index.
axis : %(axes_single_arg)s, default 0
The axis to update. The value 0 identifies the rows. For `Series`
this parameter is unused and defaults to 0.
copy : bool, default True
Whether to make a copy of the underlying data.
.. note::
The `copy` keyword will change behavior in pandas 3.0.
`Copy-on-Write
<https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
will be enabled by default, which means that all methods with a
`copy` keyword will use a lazy copy mechanism to defer the copy and
ignore the `copy` keyword. The `copy` keyword will be removed in a
future version of pandas.
You can already get the future behavior and improvements through
enabling copy on write ``pd.options.mode.copy_on_write = True``
Returns
-------
%(klass)s
An object of type %(klass)s.
See Also
--------
%(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s.
"""
return self._set_axis_nocheck(labels, axis, inplace=False, copy=copy)
@final
def _set_axis_nocheck(
self, labels, axis: Axis, inplace: bool_t, copy: bool_t | None
):
if inplace:
setattr(self, self._get_axis_name(axis), labels)
else:
# With copy=False, we create a new object but don't copy the
# underlying data.
obj = self.copy(deep=copy and not using_copy_on_write())
setattr(obj, obj._get_axis_name(axis), labels)
return obj
@final
def _set_axis(self, axis: AxisInt, labels: AnyArrayLike | list) -> None:
"""
This is called from the cython code when we set the `index` attribute
directly, e.g. `series.index = [1, 2, 3]`.
"""
labels = ensure_index(labels)
self._mgr.set_axis(axis, labels)
self._clear_item_cache()
@final
def swapaxes(self, axis1: Axis, axis2: Axis, copy: bool_t | None = None) -> Self:
"""
Interchange axes and swap values axes appropriately.
.. deprecated:: 2.1.0
``swapaxes`` is deprecated and will be removed.
Please use ``transpose`` instead.
Returns
-------
same as input
Examples
--------
Please see examples for :meth:`DataFrame.transpose`.
"""
warnings.warn(
# GH#51946
f"'{type(self).__name__}.swapaxes' is deprecated and "
"will be removed in a future version. "
f"Please use '{type(self).__name__}.transpose' instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
i = self._get_axis_number(axis1)
j = self._get_axi… a list
will call the method numerous times.
format : dict, list of dict
Keyword args to pass to the method call of ``Styler.format``. If a list will
call the method numerous times.
format_index : dict, list of dict
Keyword args to pass to the method call of ``Styler.format_index``. If a
list will call the method numerous times.
render_kwargs : dict
Keyword args to pass to the method call of ``Styler.to_latex``.
Returns
-------
str or None
If buf is None, returns the result as a string. Otherwise returns None.
"""
from pandas.io.formats.style import Styler
self = cast("DataFrame", self)
styler = Styler(self, uuid="")
for kw_name in ["hide", "relabel_index", "format", "format_index"]:
kw = vars()[kw_name]
if isinstance(kw, dict):
getattr(styler, kw_name)(**kw)
elif isinstance(kw, list):
for sub_kw in kw:
getattr(styler, kw_name)(**sub_kw)
# bold_rows is not a direct kwarg of Styler.to_latex
render_kwargs = {} if render_kwargs is None else render_kwargs
if render_kwargs.pop("bold_rows"):
styler.map_index(lambda v: "textbf:--rwrap;")
return styler.to_latex(buf=buf, **render_kwargs)
@overload
def to_csv(
self,
path_or_buf: None = ...,
sep: str = ...,
na_rep: str = ...,
float_format: str | Callable | None = ...,
columns: Sequence[Hashable] | None = ...,
header: bool_t | list[str] = ...,
index: bool_t = ...,
index_label: IndexLabel | None = ...,
mode: str = ...,
encoding: str | None = ...,
compression: CompressionOptions = ...,
quoting: int | None = ...,
quotechar: str = ...,
lineterminator: str | None = ...,
chunksize: int | None = ...,
date_format: str | None = ...,
doublequote: bool_t = ...,
escapechar: str | None = ...,
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
) -> str:
...
@overload
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
sep: str = ...,
na_rep: str = ...,
float_format: str | Callable | None = ...,
columns: Sequence[Hashable] | None = ...,
header: bool_t | list[str] = ...,
index: bool_t = ...,
index_label: IndexLabel | None = ...,
mode: str = ...,
encoding: str | None = ...,
compression: CompressionOptions = ...,
quoting: int | None = ...,
quotechar: str = ...,
lineterminator: str | None = ...,
chunksize: int | None = ...,
date_format: str | None = ...,
doublequote: bool_t = ...,
escapechar: str | None = ...,
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
) -> None:
...
@final
@deprecate_nonkeyword_arguments(
version="3.0", allowed_args=["self", "path_or_buf"], name="to_csv"
)
@doc(
storage_options=_shared_docs["storage_options"],
compression_options=_shared_docs["compression_options"] % "path_or_buf",
)
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
sep: str = ",",
na_rep: str = "",
float_format: str | Callable | None = None,
columns: Sequence[Hashable] | None = None,
header: bool_t | list[str] = True,
index: bool_t = True,
index_label: IndexLabel | None = None,
mode: str = "w",
encoding: str | None = None,
compression: CompressionOptions = "infer",
quoting: int | None = None,
quotechar: str = '"',
lineterminator: str | None = None,
chunksize: int | None = None,
date_format: str | None = None,
doublequote: bool_t = True,
escapechar: str | None = None,
decimal: str = ".",
errors: OpenFileErrors = "strict",
storage_options: StorageOptions | None = None,
) -> str | None:
r"""
Write object to a comma-separated values (csv) file.
Parameters
----------
path_or_buf : str, path object, file-like object, or None, default None
String, path object (implementing os.PathLike[str]), or file-like
object implementing a write() function. If None, the result is
returned as a string. If a non-binary file object is passed, it should
be opened with `newline=''`, disabling universal newlines. If a binary
file object is passed, `mode` might need to contain a `'b'`.
sep : str, default ','
String of length 1. Field delimiter for the output file.
na_rep : str, default ''
Missing data representation.
float_format : str, Callable, default None
Format string for floating point numbers. If a Callable is given, it takes
precedence over other numeric formatting parameters, like decimal.
columns : sequence, optional
Columns to write.
header : bool or list of str, default True
Write out the column names. If a list of strings is given it is
assumed to be aliases for the column names.
index : bool, default True
Write row names (index).
index_label : str or sequence, or False, default None
Column label for index column(s) if desired. If None is given, and
`header` and `index` are True, then the index names are used. A
sequence should be given if the object uses MultiIndex. If
False do not print fields for index names. Use index_label=False
for easier importing in R.
mode : {{'w', 'x', 'a'}}, default 'w'
Forwarded to either `open(mode=)` or `fsspec.open(mode=)` to control
the file opening. Typical values include:
- 'w', truncate the file first.
- 'x', exclusive creation, failing if the file already exists.
- 'a', append to the end of file if it exists.
encoding : str, optional
A string representing the encoding to use in the output file,
defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
is a non-binary file object.
{compression_options}
May be a dict with key 'method' as compression mode
and other entries as additional compression options if
compression mode is 'zip'.
Passing compression options as keys in dict is
supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.
quoting : optional constant from csv module
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
will treat them as non-numeric.
quotechar : str, default '\"'
String of length 1. Character used to quote fields.
lineterminator : str, optional
The newline character or character sequence to use in the output
file. Defaults to `os.linesep`, which depends on the OS in which
this method is called ('\\n' for linux, '\\r\\n' for Windows, i.e.).
.. versionchanged:: 1.5.0
Previously was line_terminator, changed for consistency with
read_csv and the standard library 'csv' module.
chunksize : int or None
Rows to write at a time.
date_format : str, default None
Format string for datetime objects.
doublequote : bool, default True
Control quoting of `quotechar` inside a field.
escapechar : str, default None
String of length 1. Character used to escape `sep` and `quotechar`
when appropriate.
decimal : str, default '.'
Character recognized as decimal separator. E.g. use ',' for
European data.
errors : str, default 'strict'
Specifies how encoding and decoding errors are to be handled.
See the errors argument for :func:`open` for a full list
of options.
{storage_options}
Returns
-------
None or str
If path_or_buf is None, returns the resulting csv format as a
string. Otherwise returns None.
See Also
--------
read_csv : Load a CSV file into a DataFrame.
to_excel : Write DataFrame to an Excel file.
Examples
--------
Create 'out.csv' containing 'df' without indices
>>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'],
... 'mask': ['red', 'purple'],
... 'weapon': ['sai', 'bo staff']}})
>>> df.to_csv('out.csv', index=False) # doctest: +SKIP
Create 'out.zip' containing 'out.csv'
>>> df.to_csv(index=False)
'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
>>> compression_opts = dict(method='zip',
... archive_name='out.csv') # doctest: +SKIP
>>> df.to_csv('out.zip', index=False,
... compression=compression_opts) # doctest: +SKIP
To write a csv file to a new folder or nested folder you will first
need to create it using either Pathlib or os:
>>> from pathlib import Path # doctest: +SKIP
>>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP
>>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP
>>> df.to_csv(filepath) # doctest: +SKIP
>>> import os # doctest: +SKIP
>>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP
>>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP
"""
df = self if isinstance(self, ABCDataFrame) else self.to_frame()
formatter = DataFrameFormatter(
frame=df,
header=header,
index=index,
na_rep=na_rep,
float_format=float_format,
decimal=decimal,
)
return DataFrameRenderer(formatter).to_csv(
path_or_buf,
lineterminator=lineterminator,
sep=sep,
encoding=encoding,
errors=errors,
compression=compression,
quoting=quoting,
columns=columns,
index_label=index_label,
mode=mode,
chunksize=chunksize,
quotechar=quotechar,
date_format=date_format,
doublequote=doublequote,
escapechar=escapechar,
storage_options=storage_options,
)
# ----------------------------------------------------------------------
# Lookup Caching
def _reset_cacher(self) -> None:
"""
Reset the cacher.
"""
raise AbstractMethodError(self)
def _maybe_update_cacher(
self,
clear: bool_t = False,
verify_is_copy: bool_t = True,
inplace: bool_t = False,
) -> None:
"""
See if we need to update our parent cacher if clear, then clear our
cache.
Parameters
----------
clear : bool, default False
Clear the item cache.
verify_is_copy : bool, default True
Provide is_copy checks.
"""
if using_copy_on_write():
return
if verify_is_copy:
self._check_setitem_copy(t="referent")
if clear:
self._clear_item_cache()
def _clear_item_cache(self) -> None:
raise AbstractMethodError(self)
# ----------------------------------------------------------------------
# Indexing Methods
@final
def take(self, indices, axis: Axis = 0, **kwargs) -> Self:
"""
Return the elements in the given *positional* indices along an axis.
This means that we are not indexing according to actual values in
the index attribute of the object. We are indexing according to the
actual position of the element in the object.
Parameters
----------
indices : array-like
An array of ints indicating which positions to take.
axis : {0 or 'index', 1 or 'columns', None}, default 0
The axis on which to select elements. ``0`` means that we are
selecting rows, ``1`` means that we are selecting columns.
For `Series` this parameter is unused and defaults to 0.
**kwargs
For compatibility with :meth:`numpy.take`. Has no effect on the
output.
Returns
-------
same type as caller
An array-like containing the elements taken from the object.
See Also
--------
DataFrame.loc : Select a subset of a DataFrame by labels.
DataFrame.iloc : Select a subset of a DataFrame by positions.
numpy.take : Take elements from an array along an axis.
Examples
--------
>>> df = pd.DataFrame([('falcon', 'bird', 389.0),
... ('parrot', 'bird', 24.0),
... ('lion', 'mammal', 80.5),
... ('monkey', 'mammal', np.nan)],
... columns=['name', 'class', 'max_speed'],
... index=[0, 2, 3, 1])
>>> df
name class max_speed
0 falcon bird 389.0
2 parrot bird 24.0
3 lion mammal 80.5
1 monkey mammal NaN
Take elements at positions 0 and 3 along the axis 0 (default).
Note how the actual indices selected (0 and 1) do not correspond to
our selected indices 0 and 3. That's because we are selecting the 0th
and 3rd rows, not rows whose indices equal 0 and 3.
>>> df.take([0, 3])
name class max_speed
0 falcon bird 389.0
1 monkey mammal NaN
Take elements at indices 1 and 2 along the axis 1 (column selection).
>>> df.take([1, 2], axis=1)
class max_speed
0 bird 389.0
2 bird 24.0
3 mammal 80.5
1 mammal NaN
We may take elements using negative integers for positive indices,
starting from the end of the object, just like with Python lists.
>>> df.take([-1, -2])
name class max_speed
1 monkey mammal NaN
3 lion mammal 80.5
"""
nv.validate_take((), kwargs)
if not isinstance(indices, slice):
indices = np.asarray(indices, dtype=np.intp)
if (
axis == 0
and indices.ndim == 1
and using_copy_on_write()
and is_range_indexer(indices, len(self))
):
return self.copy(deep=None)
elif self.ndim == 1:
raise TypeError(
f"{type(self).__name__}.take requires a sequence of integers, "
"not slice."
)
else:
warnings.warn(
# GH#51539
f"Passing a slice to {type(self).__name__}.take is deprecated "
"and will raise in a future version. Use `obj[slicer]` or pass "
"a sequence of integers instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
# We can get here with a slice via DataFrame.__getitem__
indices = np.arange(
indices.start, indices.stop, indices.step, dtype=np.intp
)
new_data = self._mgr.take(
indices,
axis=self._get_block_manager_axis(axis),
verify=True,
)
return self._constructor_from_mgr(new_data, axes=new_data.axes).__finalize__(
self, method="take"
)
@final
def _take_with_is_copy(self, indices, axis: Axis = 0) -> Self:
"""
Internal version of the `take` method that sets the `_is_copy`
attribute to keep track of the parent dataframe (using in indexing
for the SettingWithCopyWarning).
For Series this does the same as the public take (it never sets `_is_copy`).
See the docstring of `take` for full explanation of the parameters.
"""
result = self.take(indices=indices, axis=axis)
# Maybe set copy if we didn't actually change the index.
if self.ndim == 2 and not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
return result
@final
def xs(
self,
key: IndexLabel,
axis: Axis = 0,
level: IndexLabel | None = None,
drop_level: bool_t = True,
) -> Self:
"""
Return cross-section from the Series/DataFrame.
This method takes a `key` argument to select data at a particular
level of a MultiIndex.
Parameters
----------
key : label or tuple of label
Label contained in the index, or partially in a MultiIndex.
axis : {0 or 'index', 1 or 'columns'}, default 0
Axis to retrieve cross-section on.
level : object, defaults to first n levels (n=1 or len(key))
In case of a key partially contained in a MultiIndex, indicate
which levels are used. Levels can be referred by label or position.
drop_level : bool, default True
If False, returns object with same levels as self.
Returns
-------
Series or DataFrame
Cross-section from the original Series or DataFrame
corresponding to the selected index levels.
See Also
--------
DataFrame.loc : Access a group of rows and columns
by label(s) or a boolean array.
DataFrame.iloc : Purely integer-location based indexing
for selection by position.
Notes
-----
`xs` can not be used to set values.
MultiIndex Slicers is a generic way to get/set values on
any level or levels.
It is a superset of `xs` functionality, see
:ref:`MultiIndex Slicers <advanced.mi_slicers>`.
Examples
--------
>>> d = {'num_legs': [4, 4, 2, 2],
... 'num_wings': [0, 0, 2, 2],
... 'class': ['mammal', 'mammal', 'mammal', 'bird'],
... 'animal': ['cat', 'dog', 'bat', 'penguin'],
... 'locomotion': ['walks', 'walks', 'flies', 'walks']}
>>> df = pd.DataFrame(data=d)
>>> df = df.set_index(['class', 'animal', 'locomotion'])
>>> df
num_legs num_wings
class animal locomotion
mammal cat walks 4 0
dog walks 4 0
bat flies 2 2
bird penguin walks 2 2
Get values at specified index
>>> df.xs('mammal')
num_legs num_wings
animal locomotion
cat walks 4 0
dog walks 4 0
bat flies 2 2
Get values at several indexes
>>> df.xs(('mammal', 'dog', 'walks'))
num_legs 4
num_wings 0
Name: (mammal, dog, walks), dtype: int64
Get values at specified index and level
>>> df.xs('cat', level=1)
num_legs num_wings
class locomotion
mammal walks 4 0
Get values at several indexes and levels
>>> df.xs(('bird', 'walks'),
... level=[0, 'locomotion'])
num_legs num_wings
animal
penguin 2 2
Get values at specified column and axis
>>> df.xs('num_wings', axis=1)
class animal locomotion
mammal cat walks 0
dog walks 0
bat flies 2
bird penguin walks 2
Name: num_wings, dtype: int64
"""
axis = self._get_axis_number(axis)
labels = self._get_axis(axis)
if isinstance(key, list):
raise TypeError("list keys are not supported in xs, pass a tuple instead")
if level is not None:
if not isinstance(labels, MultiIndex):
raise TypeError("Index must be a MultiIndex")
loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)
# create the tuple of the indexer
_indexer = [slice(None)] * self.ndim
_indexer[axis] = loc
indexer = tuple(_indexer)
result = self.iloc[indexer]
setattr(result, result._get_axis_name(axis), new_ax)
return result
if axis == 1:
if drop_level:
return self[key]
index = self.columns
else:
index = self.index
if isinstance(index, MultiIndex):
loc, new_index = index._get_loc_level(key, level=0)
if not drop_level:
if lib.is_integer(loc):
# Slice index must be an integer or None
new_index = index[loc : loc + 1]
else:
new_index = index[loc]
else:
loc = index.get_loc(key)
if isinstance(loc, np.ndarray):
if loc.dtype == np.bool_:
(inds,) = loc.nonzero()
return self._take_with_is_copy(inds, axis=axis)
else:
return self._take_with_is_copy(loc, axis=axis)
if not is_scalar(loc):
new_index = index[loc]
if is_scalar(loc) and axis == 0:
# In this case loc should be an integer
if self.ndim == 1:
# if we encounter an array-like and we only have 1 dim
# that means that their are list/ndarrays inside the Series!
# so just return them (GH 6394)
return self._values[loc]
new_mgr = self._mgr.fast_xs(loc)
result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes)
result._name = self.index[loc]
result = result.__finalize__(self)
elif is_scalar(loc):
result = self.iloc[:, slice(loc, loc + 1)]
elif axis == 1:
result = self.iloc[:, loc]
else:
result = self.iloc[loc]
result.index = new_index
# this could be a view
# but only in a single-dtyped view sliceable case
result._set_is_copy(self, copy=not result._is_view)
return result
def __getitem__(self, item):
raise AbstractMethodError(self)
@final
def _getitem_slice(self, key: slice) -> Self:
"""
__getitem__ for the case where the key is a slice object.
"""
# _convert_slice_indexer to determine if this slice is positional
# or label based, and if the latter, convert to positional
slobj = self.index._convert_slice_indexer(key, kind="getitem")
if isinstance(slobj, np.ndarray):
# reachable with DatetimeIndex
indexer = lib.maybe_indices_to_slice(
slobj.astype(np.intp, copy=False), len(self)
)
if isinstance(indexer, np.ndarray):
# GH#43223 If we can not convert, use take
return self.take(indexer, axis=0)
slobj = indexer
return self._slice(slobj)
def _slice(self, slobj: slice, axis: AxisInt = 0) -> Self:
"""
Construct a slice of this container.
Slicing with this method is *always* positional.
"""
assert isinstance(slobj, slice), type(slobj)
axis = self._get_block_manager_axis(axis)
new_mgr = self._mgr.get_slice(slobj, axis=axis)
result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
result = result.__finalize__(self)
# this could be a view
# but only in a single-dtyped view sliceable case
is_copy = axis != 0 or result._is_view
result._set_is_copy(self, copy=is_copy)
return result
@final
def _set_is_copy(self, ref: NDFrame, copy: bool_t = True) -> None:
if not copy:
self._is_copy = None
else:
assert ref is not None
self._is_copy = weakref.ref(ref)
def _check_is_chained_assignment_possible(self) -> bool_t:
"""
Check if we are a view, have a cacher, and are of mixed type.
If so, then force a setitem_copy check.
Should be called just near setting a value
Will return a boolean if it we are a view and are cached, but a
single-dtype meaning that the cacher should be updated following
setting.
"""
if self._is_copy:
self._check_setitem_copy(t="referent")
return False
@final
def _check_setitem_copy(self, t: str = "setting", force: bool_t = False):
"""
Parameters
----------
t : str, the type of setting error
force : bool, default False
If True, then force showing an error.
validate if we are doing a setitem on a chained copy.
It is technically possible to figure out that we are setting on
a copy even WITH a multi-dtyped pandas object. In other words, some
blocks may be views while other are not. Currently _is_view will ALWAYS
return False for multi-blocks to avoid having to handle this case.
df = DataFrame(np.arange(0,9), columns=['count'])
df['group'] = 'b'
# This technically need not raise SettingWithCopy if both are view
# (which is not generally guaranteed but is usually True. However,
# this is in general not a good practice and we recommend using .loc.
df.iloc[0:5]['group'] = 'a'
"""
if using_copy_on_write() or warn_copy_on_write():
return
# return early if the check is not needed
if not (force or self._is_copy):
return
value = config.get_option("mode.chained_assignment")
if value is None:
return
# see if the copy is not actually referred; if so, then dissolve
# the copy weakref
if self._is_copy is not None and not isinstance(self._is_copy, str):
r = self._is_copy()
if not gc.get_referents(r) or (r is not None and r.shape == self.shape):
self._is_copy = None
return
# a custom message
if isinstance(self._is_copy, str):
t = self._is_copy
elif t == "referent":
t = (
"\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame\n\n"
"See the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy"
)
else:
t = (
"\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame.\n"
"Try using .loc[row_indexer,col_indexer] = value "
"instead\n\nSee the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy"
)
if value == "raise":
raise SettingWithCopyError(t)
if value == "warn":
> warnings.warn(t, SettingWithCopyWarning, stacklevel=find_stack_level())
E pandas.errors.SettingWithCopyWarning:
E A value is trying to be set on a copy of a slice from a DataFrame.
E Try using .loc[row_indexer,col_indexer] = value instead
E
E See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\generic.py:4472: SettingWithCopyWarning
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
10 out of 12 runs failed: test_merge[False-right] (distributed.shuffle.tests.test_merge)
artifacts/macos-latest-3.12-default-ci1/pytest.xml [took 9s]
artifacts/ubuntu-latest-3.10-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-3.11-default-ci1/pytest.xml [took 4s]
artifacts/ubuntu-latest-3.12-default-ci1/pytest.xml [took 5s]
artifacts/ubuntu-latest-3.9-default-ci1/pytest.xml [took 6s]
artifacts/ubuntu-latest-3.9-no_queue-ci1/pytest.xml [took 5s]
artifacts/windows-latest-3.10-default-ci1/pytest.xml [took 6s]
artifacts/windows-latest-3.11-default-ci1/pytest.xml [took 7s]
artifacts/windows-latest-3.12-default-ci1/pytest.xml [took 8s]
artifacts/windows-latest-3.9-default-ci1/pytest.xml [took 6s]
Raw output
pandas.errors.SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:56793', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: from_pd_divs, 1 expression
Expr=df
how = 'right', disk = False
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
await list_eq(joined, pd.merge(A, B, on="y", how=how))
assert all(d is None for d in joined.divisions)
await list_eq(
dd.merge(a, b, left_on="x", right_on="z", how=how),
pd.merge(A, B, left_on="x", right_on="z", how=how),
)
await list_eq(
dd.merge(a, b, left_on="x", right_on="z", how=how, suffixes=("1", "2")),
pd.merge(A, B, left_on="x", right_on="z", how=how, suffixes=("1", "2")),
)
await list_eq(dd.merge(a, b, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(a, B, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(A, b, how=how), pd.merge(A, B, how=how))
await list_eq(dd.merge(A, B, how=how), pd.merge(A, B, how=how))
await list_eq(
dd.merge(a, b, left_index=True, right_index=True, how=how),
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
await list_eq(
dd.merge(
a, b, left_index=True, right_index=True, how=how, suffixes=("1", "2")
),
pd.merge(
A, B, left_index=True, right_index=True, how=how, suffixes=("1", "2")
),
)
> await list_eq(
dd.merge(a, b, left_on="x", right_index=True, how=how),
pd.merge(A, B, left_on="x", right_index=True, how=how),
)
distributed\shuffle\tests\test_merge.py:218:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\shuffle\tests\test_merge.py:36: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed\client.py:336: in _result
raise exc.with_traceback(tb)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\dask_expr\_merge.py:775: in assign_index_merge_transfer
index["_index"] = df.index
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4299: in __setitem__
self._set_item(key, value)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4526: in _set_item
self._set_item_mgr(key, value, refs)
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\frame.py:4484: in _set_item_mgr
self._check_setitem_copy()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
# pyright: reportPropertyTypeMismatch=false
from __future__ import annotations
import collections
from copy import deepcopy
import datetime as dt
from functools import partial
import gc
from json import loads
import operator
import pickle
import re
import sys
from typing import (
TYPE_CHECKING,
Any,
Callable,
ClassVar,
Literal,
NoReturn,
cast,
final,
overload,
)
import warnings
import weakref
import numpy as np
from pandas._config import (
config,
using_copy_on_write,
warn_copy_on_write,
)
from pandas._libs import lib
from pandas._libs.lib import is_range_indexer
from pandas._libs.tslibs import (
Period,
Tick,
Timestamp,
to_offset,
)
from pandas._libs.tslibs.dtypes import freq_to_period_freqstr
from pandas._typing import (
AlignJoin,
AnyArrayLike,
ArrayLike,
Axes,
Axis,
AxisInt,
CompressionOptions,
DtypeArg,
DtypeBackend,
DtypeObj,
FilePath,
FillnaOptions,
FloatFormatType,
FormattersType,
Frequency,
IgnoreRaise,
IndexKeyFunc,
IndexLabel,
InterpolateOptions,
IntervalClosedType,
JSONSerializable,
Level,
Manager,
NaPosition,
NDFrameT,
OpenFileErrors,
RandomState,
ReindexMethod,
Renamer,
Scalar,
Self,
SequenceNotStr,
SortKind,
StorageOptions,
Suffixes,
T,
TimeAmbiguous,
TimedeltaConvertibleTypes,
TimeNonexistent,
TimestampConvertibleTypes,
TimeUnit,
ValueKeyFunc,
WriteBuffer,
WriteExcelBuffer,
npt,
)
from pandas.compat import PYPY
from pandas.compat._constants import REF_COUNT
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.errors import (
AbstractMethodError,
ChainedAssignmentError,
InvalidIndexError,
SettingWithCopyError,
SettingWithCopyWarning,
_chained_assignment_method_msg,
_chained_assignment_warning_method_msg,
_check_cacher,
)
from pandas.util._decorators import (
deprecate_nonkeyword_arguments,
doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
check_dtype_backend,
validate_ascending,
validate_bool_kwarg,
validate_fillna_kwargs,
validate_inclusive,
)
from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.common import (
ensure_object,
ensure_platform_int,
ensure_str,
is_bool,
is_bool_dtype,
is_dict_like,
is_extension_array_dtype,
is_list_like,
is_number,
is_numeric_dtype,
is_re_compilable,
is_scalar,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
ExtensionDtype,
)
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCSeries,
)
from pandas.core.dtypes.inference import (
is_hashable,
is_nested_list_like,
)
from pandas.core.dtypes.missing import (
isna,
notna,
)
from pandas.core import (
algorithms as algos,
arraylike,
common,
indexing,
missing,
nanops,
sample,
)
from pandas.core.array_algos.replace import should_use_regex
from pandas.core.arrays import ExtensionArray
from pandas.core.base import PandasObject
from pandas.core.construction import extract_array
from pandas.core.flags import Flags
from pandas.core.indexes.api import (
DatetimeIndex,
Index,
MultiIndex,
PeriodIndex,
RangeIndex,
default_index,
ensure_index,
)
from pandas.core.internals import (
ArrayManager,
BlockManager,
SingleArrayManager,
)
from pandas.core.internals.construction import (
mgr_to_mgr,
ndarray_to_mgr,
)
from pandas.core.methods.describe import describe_ndframe
from pandas.core.missing import (
clean_fill_method,
clean_reindex_fill_method,
find_valid_index,
)
from pandas.core.reshape.concat import concat
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import get_indexer_indexer
from pandas.core.window import (
Expanding,
ExponentialMovingWindow,
Rolling,
Window,
)
from pandas.io.formats.format import (
DataFrameFormatter,
DataFrameRenderer,
)
from pandas.io.formats.printing import pprint_thing
if TYPE_CHECKING:
from collections.abc import (
Hashable,
Iterator,
Mapping,
Sequence,
)
from pandas._libs.tslibs import BaseOffset
from pandas import (
DataFrame,
ExcelWriter,
HDFStore,
Series,
)
from pandas.core.indexers.objects import BaseIndexer
from pandas.core.resample import Resampler
# goal is to be able to define the docs close to function, while still being
# able to share
_shared_docs = {**_shared_docs}
_shared_doc_kwargs = {
"axes": "keywords for axes",
"klass": "Series/DataFrame",
"axes_single_arg": "{0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame", # noqa: E501
"inplace": """
inplace : bool, default False
If True, performs operation inplace and returns None.""",
"optional_by": """
by : str or list of str
Name or list of names to sort by""",
}
bool_t = bool # Need alias because NDFrame has def bool:
class NDFrame(PandasObject, indexing.IndexingMixin):
"""
N-dimensional analogue of DataFrame. Store multi-dimensional in a
size-mutable, labeled data structure
Parameters
----------
data : BlockManager
axes : list
copy : bool, default False
"""
_internal_names: list[str] = [
"_mgr",
"_cacher",
"_item_cache",
"_cache",
"_is_copy",
"_name",
"_metadata",
"_flags",
]
_internal_names_set: set[str] = set(_internal_names)
_accessors: set[str] = set()
_hidden_attrs: frozenset[str] = frozenset([])
_metadata: list[str] = []
_is_copy: weakref.ReferenceType[NDFrame] | str | None = None
_mgr: Manager
_attrs: dict[Hashable, Any]
_typ: str
# ----------------------------------------------------------------------
# Constructors
def __init__(self, data: Manager) -> None:
object.__setattr__(self, "_is_copy", None)
object.__setattr__(self, "_mgr", data)
object.__setattr__(self, "_item_cache", {})
object.__setattr__(self, "_attrs", {})
object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))
@final
@classmethod
def _init_mgr(
cls,
mgr: Manager,
axes: dict[Literal["index", "columns"], Axes | None],
dtype: DtypeObj | None = None,
copy: bool_t = False,
) -> Manager:
"""passed a manager and a axes dict"""
for a, axe in axes.items():
if axe is not None:
axe = ensure_index(axe)
bm_axis = cls._get_block_manager_axis(a)
mgr = mgr.reindex_axis(axe, axis=bm_axis)
# make a copy if explicitly requested
if copy:
mgr = mgr.copy()
if dtype is not None:
# avoid further copies if we can
if (
isinstance(mgr, BlockManager)
and len(mgr.blocks) == 1
and mgr.blocks[0].values.dtype == dtype
):
pass
else:
mgr = mgr.astype(dtype=dtype)
return mgr
@final
def _as_manager(self, typ: str, copy: bool_t = True) -> Self:
"""
Private helper function to create a DataFrame with specific manager.
Parameters
----------
typ : {"block", "array"}
copy : bool, default True
Only controls whether the conversion from Block->ArrayManager
copies the 1D arrays (to ensure proper/contiguous memory layout).
Returns
-------
DataFrame
New DataFrame using specified manager type. Is not guaranteed
to be a copy or not.
"""
new_mgr: Manager
new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
# fastpath of passing a manager doesn't check the option/manager class
return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self)
@classmethod
def _from_mgr(cls, mgr: Manager, axes: list[Index]) -> Self:
"""
Construct a new object of this type from a Manager object and axes.
Parameters
----------
mgr : Manager
Must have the same ndim as cls.
axes : list[Index]
Notes
-----
The axes must match mgr.axes, but are required for future-proofing
in the event that axes are refactored out of the Manager objects.
"""
obj = cls.__new__(cls)
NDFrame.__init__(obj, mgr)
return obj
# ----------------------------------------------------------------------
# attrs and flags
@property
def attrs(self) -> dict[Hashable, Any]:
"""
Dictionary of global attributes of this dataset.
.. warning::
attrs is experimental and may change without warning.
See Also
--------
DataFrame.flags : Global flags applying to this object.
Notes
-----
Many operations that create new datasets will copy ``attrs``. Copies
are always deep so that changing ``attrs`` will only affect the
present dataset. ``pandas.concat`` copies ``attrs`` only if all input
datasets have the same ``attrs``.
Examples
--------
For Series:
>>> ser = pd.Series([1, 2, 3])
>>> ser.attrs = {"A": [10, 20, 30]}
>>> ser.attrs
{'A': [10, 20, 30]}
For DataFrame:
>>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
>>> df.attrs = {"A": [10, 20, 30]}
>>> df.attrs
{'A': [10, 20, 30]}
"""
return self._attrs
@attrs.setter
def attrs(self, value: Mapping[Hashable, Any]) -> None:
self._attrs = dict(value)
@final
@property
def flags(self) -> Flags:
"""
Get the properties associated with this pandas object.
The available flags are
* :attr:`Flags.allows_duplicate_labels`
See Also
--------
Flags : Flags that apply to pandas objects.
DataFrame.attrs : Global metadata applying to this dataset.
Notes
-----
"Flags" differ from "metadata". Flags reflect properties of the
pandas object (the Series or DataFrame). Metadata refer to properties
of the dataset, and should be stored in :attr:`DataFrame.attrs`.
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2]})
>>> df.flags
<Flags(allows_duplicate_labels=True)>
Flags can be get or set using ``.``
>>> df.flags.allows_duplicate_labels
True
>>> df.flags.allows_duplicate_labels = False
Or by slicing with a key
>>> df.flags["allows_duplicate_labels"]
False
>>> df.flags["allows_duplicate_labels"] = True
"""
return self._flags
@final
def set_flags(
self,
*,
copy: bool_t = False,
allows_duplicate_labels: bool_t | None = None,
) -> Self:
"""
Return a new object with updated flags.
Parameters
----------
copy : bool, default False
Specify if a copy of the object should be made.
.. note::
The `copy` keyword will change behavior in pandas 3.0.
`Copy-on-Write
<https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
will be enabled by default, which means that all methods with a
`copy` keyword will use a lazy copy mechanism to defer the copy and
ignore the `copy` keyword. The `copy` keyword will be removed in a
future version of pandas.
You can already get the future behavior and improvements through
enabling copy on write ``pd.options.mode.copy_on_write = True``
allows_duplicate_labels : bool, optional
Whether the returned object allows duplicate labels.
Returns
-------
Series or DataFrame
The same type as the caller.
See Also
--------
DataFrame.attrs : Global metadata applying to this dataset.
DataFrame.flags : Global flags applying to this object.
Notes
-----
This method returns a new object that's a view on the same data
as the input. Mutating the input or the output values will be reflected
in the other.
This method is intended to be used in method chains.
"Flags" differ from "metadata". Flags reflect properties of the
pandas object (the Series or DataFrame). Metadata refer to properties
of the dataset, and should be stored in :attr:`DataFrame.attrs`.
Examples
--------
>>> df = pd.DataFrame({"A": [1, 2]})
>>> df.flags.allows_duplicate_labels
True
>>> df2 = df.set_flags(allows_duplicate_labels=False)
>>> df2.flags.allows_duplicate_labels
False
"""
df = self.copy(deep=copy and not using_copy_on_write())
if allows_duplicate_labels is not None:
df.flags["allows_duplicate_labels"] = allows_duplicate_labels
return df
@final
@classmethod
def _validate_dtype(cls, dtype) -> DtypeObj | None:
"""validate the passed dtype"""
if dtype is not None:
dtype = pandas_dtype(dtype)
# a compound dtype
if dtype.kind == "V":
raise NotImplementedError(
"compound dtypes are not implemented "
f"in the {cls.__name__} constructor"
)
return dtype
# ----------------------------------------------------------------------
# Construction
@property
def _constructor(self) -> Callable[..., Self]:
"""
Used when a manipulation result has the same dimensions as the
original.
"""
raise AbstractMethodError(self)
# ----------------------------------------------------------------------
# Internals
@final
@property
def _data(self):
# GH#33054 retained because some downstream packages uses this,
# e.g. fastparquet
# GH#33333
warnings.warn(
f"{type(self).__name__}._data is deprecated and will be removed in "
"a future version. Use public APIs instead.",
DeprecationWarning,
stacklevel=find_stack_level(),
)
return self._mgr
# ----------------------------------------------------------------------
# Axis
_AXIS_ORDERS: list[Literal["index", "columns"]]
_AXIS_TO_AXIS_NUMBER: dict[Axis, AxisInt] = {0: 0, "index": 0, "rows": 0}
_info_axis_number: int
_info_axis_name: Literal["index", "columns"]
_AXIS_LEN: int
@final
def _construct_axes_dict(self, axes: Sequence[Axis] | None = None, **kwargs):
"""Return an axes dictionary for myself."""
d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
# error: Argument 1 to "update" of "MutableMapping" has incompatible type
# "Dict[str, Any]"; expected "SupportsKeysAndGetItem[Union[int, str], Any]"
d.update(kwargs) # type: ignore[arg-type]
return d
@final
@classmethod
def _get_axis_number(cls, axis: Axis) -> AxisInt:
try:
return cls._AXIS_TO_AXIS_NUMBER[axis]
except KeyError:
raise ValueError(f"No axis named {axis} for object type {cls.__name__}")
@final
@classmethod
def _get_axis_name(cls, axis: Axis) -> Literal["index", "columns"]:
axis_number = cls._get_axis_number(axis)
return cls._AXIS_ORDERS[axis_number]
@final
def _get_axis(self, axis: Axis) -> Index:
axis_number = self._get_axis_number(axis)
assert axis_number in {0, 1}
return self.index if axis_number == 0 else self.columns
@final
@classmethod
def _get_block_manager_axis(cls, axis: Axis) -> AxisInt:
"""Map the axis to the block_manager axis."""
axis = cls._get_axis_number(axis)
ndim = cls._AXIS_LEN
if ndim == 2:
# i.e. DataFrame
return 1 - axis
return axis
@final
def _get_axis_resolvers(self, axis: str) -> dict[str, Series | MultiIndex]:
# index or columns
axis_index = getattr(self, axis)
d = {}
prefix = axis[0]
for i, name in enumerate(axis_index.names):
if name is not None:
key = level = name
else:
# prefix with 'i' or 'c' depending on the input axis
# e.g., you must do ilevel_0 for the 0th level of an unnamed
# multiiindex
key = f"{prefix}level_{i}"
level = i
level_values = axis_index.get_level_values(level)
s = level_values.to_series()
s.index = axis_index
d[key] = s
# put the index/columns itself in the dict
if isinstance(axis_index, MultiIndex):
dindex = axis_index
else:
dindex = axis_index.to_series()
d[axis] = dindex
return d
@final
def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]:
from pandas.core.computation.parsing import clean_column_name
d: dict[str, Series | MultiIndex] = {}
for axis_name in self._AXIS_ORDERS:
d.update(self._get_axis_resolvers(axis_name))
return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)}
@final
def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:
"""
Return the special character free column resolvers of a dataframe.
Column names with special characters are 'cleaned up' so that they can
be referred to by backtick quoting.
Used in :meth:`DataFrame.eval`.
"""
from pandas.core.computation.parsing import clean_column_name
from pandas.core.series import Series
if isinstance(self, ABCSeries):
return {clean_column_name(self.name): self}
return {
clean_column_name(k): Series(
v, copy=False, index=self.index, name=k, dtype=self.dtypes[k]
).__finalize__(self)
for k, v in zip(self.columns, self._iter_column_arrays())
if not isinstance(k, int)
}
@final
@property
def _info_axis(self) -> Index:
return getattr(self, self._info_axis_name)
def _is_view_after_cow_rules(self):
# Only to be used in cases of chained assignment checks, this is a
# simplified check that assumes that either the whole object is a view
# or a copy
if len(self._mgr.blocks) == 0: # type: ignore[union-attr]
return False
return self._mgr.blocks[0].refs.has_reference() # type: ignore[union-attr]
@property
def shape(self) -> tuple[int, ...]:
"""
Return a tuple of axis dimensions
"""
return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)
@property
def axes(self) -> list[Index]:
"""
Return index label(s) of the internal NDFrame
"""
# we do it this way because if we have reversed axes, then
# the block manager shows then reversed
return [self._get_axis(a) for a in self._AXIS_ORDERS]
@final
@property
def ndim(self) -> int:
"""
Return an int representing the number of axes / array dimensions.
Return 1 if Series. Otherwise return 2 if DataFrame.
See Also
--------
ndarray.ndim : Number of array dimensions.
Examples
--------
>>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
>>> s.ndim
1
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.ndim
2
"""
return self._mgr.ndim
@final
@property
def size(self) -> int:
"""
Return an int representing the number of elements in this object.
Return the number of rows if Series. Otherwise return the number of
rows times number of columns if DataFrame.
See Also
--------
ndarray.size : Number of elements in the array.
Examples
--------
>>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
>>> s.size
3
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.size
4
"""
return int(np.prod(self.shape))
def set_axis(
self,
labels,
*,
axis: Axis = 0,
copy: bool_t | None = None,
) -> Self:
"""
Assign desired index to given axis.
Indexes for%(extended_summary_sub)s row labels can be changed by assigning
a list-like or Index.
Parameters
----------
labels : list-like, Index
The values for the new index.
axis : %(axes_single_arg)s, default 0
The axis to update. The value 0 identifies the rows. For `Series`
this parameter is unused and defaults to 0.
copy : bool, default True
Whether to make a copy of the underlying data.
.. note::
The `copy` keyword will change behavior in pandas 3.0.
`Copy-on-Write
<https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
will be enabled by default, which means that all methods with a
`copy` keyword will use a lazy copy mechanism to defer the copy and
ignore the `copy` keyword. The `copy` keyword will be removed in a
future version of pandas.
You can already get the future behavior and improvements through
enabling copy on write ``pd.options.mode.copy_on_write = True``
Returns
-------
%(klass)s
An object of type %(klass)s.
See Also
--------
%(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s.
"""
return self._set_axis_nocheck(labels, axis, inplace=False, copy=copy)
@final
def _set_axis_nocheck(
self, labels, axis: Axis, inplace: bool_t, copy: bool_t | None
):
if inplace:
setattr(self, self._get_axis_name(axis), labels)
else:
# With copy=False, we create a new object but don't copy the
# underlying data.
obj = self.copy(deep=copy and not using_copy_on_write())
setattr(obj, obj._get_axis_name(axis), labels)
return obj
@final
def _set_axis(self, axis: AxisInt, labels: AnyArrayLike | list) -> None:
"""
This is called from the cython code when we set the `index` attribute
directly, e.g. `series.index = [1, 2, 3]`.
"""
labels = ensure_index(labels)
self._mgr.set_axis(axis, labels)
self._clear_item_cache()
@final
def swapaxes(self, axis1: Axis, axis2: Axis, copy: bool_t | None = None) -> Self:
"""
Interchange axes and swap values axes appropriately.
.. deprecated:: 2.1.0
``swapaxes`` is deprecated and will be removed.
Please use ``transpose`` instead.
Returns
-------
same as input
Examples
--------
Please see examples for :meth:`DataFrame.transpose`.
"""
warnings.warn(
# GH#51946
f"'{type(self).__name__}.swapaxes' is deprecated and "
"will be removed in a future version. "
f"Please use '{type(self).__name__}.transpose' instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
i = self._get_axis_number(axis1)
j = self._get_axis_number(axis2)
if i ==… a list
will call the method numerous times.
format : dict, list of dict
Keyword args to pass to the method call of ``Styler.format``. If a list will
call the method numerous times.
format_index : dict, list of dict
Keyword args to pass to the method call of ``Styler.format_index``. If a
list will call the method numerous times.
render_kwargs : dict
Keyword args to pass to the method call of ``Styler.to_latex``.
Returns
-------
str or None
If buf is None, returns the result as a string. Otherwise returns None.
"""
from pandas.io.formats.style import Styler
self = cast("DataFrame", self)
styler = Styler(self, uuid="")
for kw_name in ["hide", "relabel_index", "format", "format_index"]:
kw = vars()[kw_name]
if isinstance(kw, dict):
getattr(styler, kw_name)(**kw)
elif isinstance(kw, list):
for sub_kw in kw:
getattr(styler, kw_name)(**sub_kw)
# bold_rows is not a direct kwarg of Styler.to_latex
render_kwargs = {} if render_kwargs is None else render_kwargs
if render_kwargs.pop("bold_rows"):
styler.map_index(lambda v: "textbf:--rwrap;")
return styler.to_latex(buf=buf, **render_kwargs)
@overload
def to_csv(
self,
path_or_buf: None = ...,
sep: str = ...,
na_rep: str = ...,
float_format: str | Callable | None = ...,
columns: Sequence[Hashable] | None = ...,
header: bool_t | list[str] = ...,
index: bool_t = ...,
index_label: IndexLabel | None = ...,
mode: str = ...,
encoding: str | None = ...,
compression: CompressionOptions = ...,
quoting: int | None = ...,
quotechar: str = ...,
lineterminator: str | None = ...,
chunksize: int | None = ...,
date_format: str | None = ...,
doublequote: bool_t = ...,
escapechar: str | None = ...,
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
) -> str:
...
@overload
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
sep: str = ...,
na_rep: str = ...,
float_format: str | Callable | None = ...,
columns: Sequence[Hashable] | None = ...,
header: bool_t | list[str] = ...,
index: bool_t = ...,
index_label: IndexLabel | None = ...,
mode: str = ...,
encoding: str | None = ...,
compression: CompressionOptions = ...,
quoting: int | None = ...,
quotechar: str = ...,
lineterminator: str | None = ...,
chunksize: int | None = ...,
date_format: str | None = ...,
doublequote: bool_t = ...,
escapechar: str | None = ...,
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
) -> None:
...
@final
@deprecate_nonkeyword_arguments(
version="3.0", allowed_args=["self", "path_or_buf"], name="to_csv"
)
@doc(
storage_options=_shared_docs["storage_options"],
compression_options=_shared_docs["compression_options"] % "path_or_buf",
)
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
sep: str = ",",
na_rep: str = "",
float_format: str | Callable | None = None,
columns: Sequence[Hashable] | None = None,
header: bool_t | list[str] = True,
index: bool_t = True,
index_label: IndexLabel | None = None,
mode: str = "w",
encoding: str | None = None,
compression: CompressionOptions = "infer",
quoting: int | None = None,
quotechar: str = '"',
lineterminator: str | None = None,
chunksize: int | None = None,
date_format: str | None = None,
doublequote: bool_t = True,
escapechar: str | None = None,
decimal: str = ".",
errors: OpenFileErrors = "strict",
storage_options: StorageOptions | None = None,
) -> str | None:
r"""
Write object to a comma-separated values (csv) file.
Parameters
----------
path_or_buf : str, path object, file-like object, or None, default None
String, path object (implementing os.PathLike[str]), or file-like
object implementing a write() function. If None, the result is
returned as a string. If a non-binary file object is passed, it should
be opened with `newline=''`, disabling universal newlines. If a binary
file object is passed, `mode` might need to contain a `'b'`.
sep : str, default ','
String of length 1. Field delimiter for the output file.
na_rep : str, default ''
Missing data representation.
float_format : str, Callable, default None
Format string for floating point numbers. If a Callable is given, it takes
precedence over other numeric formatting parameters, like decimal.
columns : sequence, optional
Columns to write.
header : bool or list of str, default True
Write out the column names. If a list of strings is given it is
assumed to be aliases for the column names.
index : bool, default True
Write row names (index).
index_label : str or sequence, or False, default None
Column label for index column(s) if desired. If None is given, and
`header` and `index` are True, then the index names are used. A
sequence should be given if the object uses MultiIndex. If
False do not print fields for index names. Use index_label=False
for easier importing in R.
mode : {{'w', 'x', 'a'}}, default 'w'
Forwarded to either `open(mode=)` or `fsspec.open(mode=)` to control
the file opening. Typical values include:
- 'w', truncate the file first.
- 'x', exclusive creation, failing if the file already exists.
- 'a', append to the end of file if it exists.
encoding : str, optional
A string representing the encoding to use in the output file,
defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
is a non-binary file object.
{compression_options}
May be a dict with key 'method' as compression mode
and other entries as additional compression options if
compression mode is 'zip'.
Passing compression options as keys in dict is
supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.
quoting : optional constant from csv module
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
will treat them as non-numeric.
quotechar : str, default '\"'
String of length 1. Character used to quote fields.
lineterminator : str, optional
The newline character or character sequence to use in the output
file. Defaults to `os.linesep`, which depends on the OS in which
this method is called ('\\n' for linux, '\\r\\n' for Windows, i.e.).
.. versionchanged:: 1.5.0
Previously was line_terminator, changed for consistency with
read_csv and the standard library 'csv' module.
chunksize : int or None
Rows to write at a time.
date_format : str, default None
Format string for datetime objects.
doublequote : bool, default True
Control quoting of `quotechar` inside a field.
escapechar : str, default None
String of length 1. Character used to escape `sep` and `quotechar`
when appropriate.
decimal : str, default '.'
Character recognized as decimal separator. E.g. use ',' for
European data.
errors : str, default 'strict'
Specifies how encoding and decoding errors are to be handled.
See the errors argument for :func:`open` for a full list
of options.
{storage_options}
Returns
-------
None or str
If path_or_buf is None, returns the resulting csv format as a
string. Otherwise returns None.
See Also
--------
read_csv : Load a CSV file into a DataFrame.
to_excel : Write DataFrame to an Excel file.
Examples
--------
Create 'out.csv' containing 'df' without indices
>>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'],
... 'mask': ['red', 'purple'],
... 'weapon': ['sai', 'bo staff']}})
>>> df.to_csv('out.csv', index=False) # doctest: +SKIP
Create 'out.zip' containing 'out.csv'
>>> df.to_csv(index=False)
'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
>>> compression_opts = dict(method='zip',
... archive_name='out.csv') # doctest: +SKIP
>>> df.to_csv('out.zip', index=False,
... compression=compression_opts) # doctest: +SKIP
To write a csv file to a new folder or nested folder you will first
need to create it using either Pathlib or os:
>>> from pathlib import Path # doctest: +SKIP
>>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP
>>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP
>>> df.to_csv(filepath) # doctest: +SKIP
>>> import os # doctest: +SKIP
>>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP
>>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP
"""
df = self if isinstance(self, ABCDataFrame) else self.to_frame()
formatter = DataFrameFormatter(
frame=df,
header=header,
index=index,
na_rep=na_rep,
float_format=float_format,
decimal=decimal,
)
return DataFrameRenderer(formatter).to_csv(
path_or_buf,
lineterminator=lineterminator,
sep=sep,
encoding=encoding,
errors=errors,
compression=compression,
quoting=quoting,
columns=columns,
index_label=index_label,
mode=mode,
chunksize=chunksize,
quotechar=quotechar,
date_format=date_format,
doublequote=doublequote,
escapechar=escapechar,
storage_options=storage_options,
)
# ----------------------------------------------------------------------
# Lookup Caching
def _reset_cacher(self) -> None:
"""
Reset the cacher.
"""
raise AbstractMethodError(self)
def _maybe_update_cacher(
self,
clear: bool_t = False,
verify_is_copy: bool_t = True,
inplace: bool_t = False,
) -> None:
"""
See if we need to update our parent cacher if clear, then clear our
cache.
Parameters
----------
clear : bool, default False
Clear the item cache.
verify_is_copy : bool, default True
Provide is_copy checks.
"""
if using_copy_on_write():
return
if verify_is_copy:
self._check_setitem_copy(t="referent")
if clear:
self._clear_item_cache()
def _clear_item_cache(self) -> None:
raise AbstractMethodError(self)
# ----------------------------------------------------------------------
# Indexing Methods
@final
def take(self, indices, axis: Axis = 0, **kwargs) -> Self:
"""
Return the elements in the given *positional* indices along an axis.
This means that we are not indexing according to actual values in
the index attribute of the object. We are indexing according to the
actual position of the element in the object.
Parameters
----------
indices : array-like
An array of ints indicating which positions to take.
axis : {0 or 'index', 1 or 'columns', None}, default 0
The axis on which to select elements. ``0`` means that we are
selecting rows, ``1`` means that we are selecting columns.
For `Series` this parameter is unused and defaults to 0.
**kwargs
For compatibility with :meth:`numpy.take`. Has no effect on the
output.
Returns
-------
same type as caller
An array-like containing the elements taken from the object.
See Also
--------
DataFrame.loc : Select a subset of a DataFrame by labels.
DataFrame.iloc : Select a subset of a DataFrame by positions.
numpy.take : Take elements from an array along an axis.
Examples
--------
>>> df = pd.DataFrame([('falcon', 'bird', 389.0),
... ('parrot', 'bird', 24.0),
... ('lion', 'mammal', 80.5),
... ('monkey', 'mammal', np.nan)],
... columns=['name', 'class', 'max_speed'],
... index=[0, 2, 3, 1])
>>> df
name class max_speed
0 falcon bird 389.0
2 parrot bird 24.0
3 lion mammal 80.5
1 monkey mammal NaN
Take elements at positions 0 and 3 along the axis 0 (default).
Note how the actual indices selected (0 and 1) do not correspond to
our selected indices 0 and 3. That's because we are selecting the 0th
and 3rd rows, not rows whose indices equal 0 and 3.
>>> df.take([0, 3])
name class max_speed
0 falcon bird 389.0
1 monkey mammal NaN
Take elements at indices 1 and 2 along the axis 1 (column selection).
>>> df.take([1, 2], axis=1)
class max_speed
0 bird 389.0
2 bird 24.0
3 mammal 80.5
1 mammal NaN
We may take elements using negative integers for positive indices,
starting from the end of the object, just like with Python lists.
>>> df.take([-1, -2])
name class max_speed
1 monkey mammal NaN
3 lion mammal 80.5
"""
nv.validate_take((), kwargs)
if not isinstance(indices, slice):
indices = np.asarray(indices, dtype=np.intp)
if (
axis == 0
and indices.ndim == 1
and using_copy_on_write()
and is_range_indexer(indices, len(self))
):
return self.copy(deep=None)
elif self.ndim == 1:
raise TypeError(
f"{type(self).__name__}.take requires a sequence of integers, "
"not slice."
)
else:
warnings.warn(
# GH#51539
f"Passing a slice to {type(self).__name__}.take is deprecated "
"and will raise in a future version. Use `obj[slicer]` or pass "
"a sequence of integers instead.",
FutureWarning,
stacklevel=find_stack_level(),
)
# We can get here with a slice via DataFrame.__getitem__
indices = np.arange(
indices.start, indices.stop, indices.step, dtype=np.intp
)
new_data = self._mgr.take(
indices,
axis=self._get_block_manager_axis(axis),
verify=True,
)
return self._constructor_from_mgr(new_data, axes=new_data.axes).__finalize__(
self, method="take"
)
@final
def _take_with_is_copy(self, indices, axis: Axis = 0) -> Self:
"""
Internal version of the `take` method that sets the `_is_copy`
attribute to keep track of the parent dataframe (using in indexing
for the SettingWithCopyWarning).
For Series this does the same as the public take (it never sets `_is_copy`).
See the docstring of `take` for full explanation of the parameters.
"""
result = self.take(indices=indices, axis=axis)
# Maybe set copy if we didn't actually change the index.
if self.ndim == 2 and not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
return result
@final
def xs(
self,
key: IndexLabel,
axis: Axis = 0,
level: IndexLabel | None = None,
drop_level: bool_t = True,
) -> Self:
"""
Return cross-section from the Series/DataFrame.
This method takes a `key` argument to select data at a particular
level of a MultiIndex.
Parameters
----------
key : label or tuple of label
Label contained in the index, or partially in a MultiIndex.
axis : {0 or 'index', 1 or 'columns'}, default 0
Axis to retrieve cross-section on.
level : object, defaults to first n levels (n=1 or len(key))
In case of a key partially contained in a MultiIndex, indicate
which levels are used. Levels can be referred by label or position.
drop_level : bool, default True
If False, returns object with same levels as self.
Returns
-------
Series or DataFrame
Cross-section from the original Series or DataFrame
corresponding to the selected index levels.
See Also
--------
DataFrame.loc : Access a group of rows and columns
by label(s) or a boolean array.
DataFrame.iloc : Purely integer-location based indexing
for selection by position.
Notes
-----
`xs` can not be used to set values.
MultiIndex Slicers is a generic way to get/set values on
any level or levels.
It is a superset of `xs` functionality, see
:ref:`MultiIndex Slicers <advanced.mi_slicers>`.
Examples
--------
>>> d = {'num_legs': [4, 4, 2, 2],
... 'num_wings': [0, 0, 2, 2],
... 'class': ['mammal', 'mammal', 'mammal', 'bird'],
... 'animal': ['cat', 'dog', 'bat', 'penguin'],
... 'locomotion': ['walks', 'walks', 'flies', 'walks']}
>>> df = pd.DataFrame(data=d)
>>> df = df.set_index(['class', 'animal', 'locomotion'])
>>> df
num_legs num_wings
class animal locomotion
mammal cat walks 4 0
dog walks 4 0
bat flies 2 2
bird penguin walks 2 2
Get values at specified index
>>> df.xs('mammal')
num_legs num_wings
animal locomotion
cat walks 4 0
dog walks 4 0
bat flies 2 2
Get values at several indexes
>>> df.xs(('mammal', 'dog', 'walks'))
num_legs 4
num_wings 0
Name: (mammal, dog, walks), dtype: int64
Get values at specified index and level
>>> df.xs('cat', level=1)
num_legs num_wings
class locomotion
mammal walks 4 0
Get values at several indexes and levels
>>> df.xs(('bird', 'walks'),
... level=[0, 'locomotion'])
num_legs num_wings
animal
penguin 2 2
Get values at specified column and axis
>>> df.xs('num_wings', axis=1)
class animal locomotion
mammal cat walks 0
dog walks 0
bat flies 2
bird penguin walks 2
Name: num_wings, dtype: int64
"""
axis = self._get_axis_number(axis)
labels = self._get_axis(axis)
if isinstance(key, list):
raise TypeError("list keys are not supported in xs, pass a tuple instead")
if level is not None:
if not isinstance(labels, MultiIndex):
raise TypeError("Index must be a MultiIndex")
loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)
# create the tuple of the indexer
_indexer = [slice(None)] * self.ndim
_indexer[axis] = loc
indexer = tuple(_indexer)
result = self.iloc[indexer]
setattr(result, result._get_axis_name(axis), new_ax)
return result
if axis == 1:
if drop_level:
return self[key]
index = self.columns
else:
index = self.index
if isinstance(index, MultiIndex):
loc, new_index = index._get_loc_level(key, level=0)
if not drop_level:
if lib.is_integer(loc):
# Slice index must be an integer or None
new_index = index[loc : loc + 1]
else:
new_index = index[loc]
else:
loc = index.get_loc(key)
if isinstance(loc, np.ndarray):
if loc.dtype == np.bool_:
(inds,) = loc.nonzero()
return self._take_with_is_copy(inds, axis=axis)
else:
return self._take_with_is_copy(loc, axis=axis)
if not is_scalar(loc):
new_index = index[loc]
if is_scalar(loc) and axis == 0:
# In this case loc should be an integer
if self.ndim == 1:
# if we encounter an array-like and we only have 1 dim
# that means that their are list/ndarrays inside the Series!
# so just return them (GH 6394)
return self._values[loc]
new_mgr = self._mgr.fast_xs(loc)
result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes)
result._name = self.index[loc]
result = result.__finalize__(self)
elif is_scalar(loc):
result = self.iloc[:, slice(loc, loc + 1)]
elif axis == 1:
result = self.iloc[:, loc]
else:
result = self.iloc[loc]
result.index = new_index
# this could be a view
# but only in a single-dtyped view sliceable case
result._set_is_copy(self, copy=not result._is_view)
return result
def __getitem__(self, item):
raise AbstractMethodError(self)
@final
def _getitem_slice(self, key: slice) -> Self:
"""
__getitem__ for the case where the key is a slice object.
"""
# _convert_slice_indexer to determine if this slice is positional
# or label based, and if the latter, convert to positional
slobj = self.index._convert_slice_indexer(key, kind="getitem")
if isinstance(slobj, np.ndarray):
# reachable with DatetimeIndex
indexer = lib.maybe_indices_to_slice(
slobj.astype(np.intp, copy=False), len(self)
)
if isinstance(indexer, np.ndarray):
# GH#43223 If we can not convert, use take
return self.take(indexer, axis=0)
slobj = indexer
return self._slice(slobj)
def _slice(self, slobj: slice, axis: AxisInt = 0) -> Self:
"""
Construct a slice of this container.
Slicing with this method is *always* positional.
"""
assert isinstance(slobj, slice), type(slobj)
axis = self._get_block_manager_axis(axis)
new_mgr = self._mgr.get_slice(slobj, axis=axis)
result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
result = result.__finalize__(self)
# this could be a view
# but only in a single-dtyped view sliceable case
is_copy = axis != 0 or result._is_view
result._set_is_copy(self, copy=is_copy)
return result
@final
def _set_is_copy(self, ref: NDFrame, copy: bool_t = True) -> None:
if not copy:
self._is_copy = None
else:
assert ref is not None
self._is_copy = weakref.ref(ref)
def _check_is_chained_assignment_possible(self) -> bool_t:
"""
Check if we are a view, have a cacher, and are of mixed type.
If so, then force a setitem_copy check.
Should be called just near setting a value
Will return a boolean if it we are a view and are cached, but a
single-dtype meaning that the cacher should be updated following
setting.
"""
if self._is_copy:
self._check_setitem_copy(t="referent")
return False
@final
def _check_setitem_copy(self, t: str = "setting", force: bool_t = False):
"""
Parameters
----------
t : str, the type of setting error
force : bool, default False
If True, then force showing an error.
validate if we are doing a setitem on a chained copy.
It is technically possible to figure out that we are setting on
a copy even WITH a multi-dtyped pandas object. In other words, some
blocks may be views while other are not. Currently _is_view will ALWAYS
return False for multi-blocks to avoid having to handle this case.
df = DataFrame(np.arange(0,9), columns=['count'])
df['group'] = 'b'
# This technically need not raise SettingWithCopy if both are view
# (which is not generally guaranteed but is usually True. However,
# this is in general not a good practice and we recommend using .loc.
df.iloc[0:5]['group'] = 'a'
"""
if using_copy_on_write() or warn_copy_on_write():
return
# return early if the check is not needed
if not (force or self._is_copy):
return
value = config.get_option("mode.chained_assignment")
if value is None:
return
# see if the copy is not actually referred; if so, then dissolve
# the copy weakref
if self._is_copy is not None and not isinstance(self._is_copy, str):
r = self._is_copy()
if not gc.get_referents(r) or (r is not None and r.shape == self.shape):
self._is_copy = None
return
# a custom message
if isinstance(self._is_copy, str):
t = self._is_copy
elif t == "referent":
t = (
"\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame\n\n"
"See the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy"
)
else:
t = (
"\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame.\n"
"Try using .loc[row_indexer,col_indexer] = value "
"instead\n\nSee the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy"
)
if value == "raise":
raise SettingWithCopyError(t)
if value == "warn":
> warnings.warn(t, SettingWithCopyWarning, stacklevel=find_stack_level())
E pandas.errors.SettingWithCopyWarning:
E A value is trying to be set on a copy of a slice from a DataFrame.
E Try using .loc[row_indexer,col_indexer] = value instead
E
E See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
C:\Users\runneradmin\miniconda3\envs\dask-distributed\Lib\site-packages\pandas\core\generic.py:4472: SettingWithCopyWarning
Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh
github-actions / Unit Test Results
All 11 runs failed: test_simple (distributed.dashboard.tests.test_scheduler_bokeh)
artifacts/macos-latest-3.12-default-notci1/pytest.xml [took 1s]
artifacts/ubuntu-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 1s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 1s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 1s]
artifacts/windows-latest-3.9-default-notci1/pytest.xml [took 0s]
Raw output
tornado.httpclient.HTTPClientError: HTTP 500: Internal Server Error
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:49352', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:49353', name: 0, status: closed, stored: 0, running: 1/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:49355', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True, scheduler_kwargs={"dashboard": True})
async def test_simple(c, s, a, b):
port = s.http_server.port
ev = Event()
future = c.submit(block_on_event, ev)
await asyncio.sleep(0.1)
http_client = AsyncHTTPClient()
for suffix in applications:
if suffix in blocklist_apps:
continue
> response = await http_client.fetch(f"http://localhost:{port}{suffix}")
E tornado.httpclient.HTTPClientError: HTTP 500: Internal Server Error
distributed\dashboard\tests\test_scheduler_bokeh.py:92: HTTPClientError
Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh
github-actions / Unit Test Results
All 11 runs failed: test_stealing_events (distributed.dashboard.tests.test_scheduler_bokeh)
artifacts/macos-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.9-default-notci1/pytest.xml [took 0s]
Raw output
bokeh.util.warnings.BokehDeprecationWarning: 'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:49391', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:49392', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:49394', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True)
async def test_stealing_events(c, s, a, b):
> se = StealingEvents(s)
distributed\dashboard\tests\test_scheduler_bokeh.py:142:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\dashboard\components\scheduler.py:1925: in __init__
self.root.circle(
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\plotting\glyph_api.py:144: in circle
deprecated((3, 4, 0), "circle() method with size value", "scatter(size=...) instead")
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\deprecation.py:73: in deprecated
warn(message, BokehDeprecationWarning)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
message = "'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead."
category = <class 'bokeh.util.warnings.BokehDeprecationWarning'>, stacklevel = 4
def warn(message: str, category: type[Warning] | None = None, stacklevel: int | None = None) -> None:
if stacklevel is None:
stacklevel = find_stack_level()
> warnings.warn(message, category, stacklevel=stacklevel)
E bokeh.util.warnings.BokehDeprecationWarning: 'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\warnings.py:64: BokehDeprecationWarning
Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh
github-actions / Unit Test Results
All 11 runs failed: test_events (distributed.dashboard.tests.test_scheduler_bokeh)
artifacts/macos-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.9-default-notci1/pytest.xml [took 0s]
Raw output
bokeh.util.warnings.BokehDeprecationWarning: 'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:49403', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:49404', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:49406', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True)
async def test_events(c, s, a, b):
> e = Events(s, "all")
distributed\dashboard\tests\test_scheduler_bokeh.py:157:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\dashboard\components\scheduler.py:2019: in __init__
self.root.circle(
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\plotting\glyph_api.py:144: in circle
deprecated((3, 4, 0), "circle() method with size value", "scatter(size=...) instead")
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\deprecation.py:73: in deprecated
warn(message, BokehDeprecationWarning)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
message = "'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead."
category = <class 'bokeh.util.warnings.BokehDeprecationWarning'>, stacklevel = 4
def warn(message: str, category: type[Warning] | None = None, stacklevel: int | None = None) -> None:
if stacklevel is None:
stacklevel = find_stack_level()
> warnings.warn(message, category, stacklevel=stacklevel)
E bokeh.util.warnings.BokehDeprecationWarning: 'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\warnings.py:64: BokehDeprecationWarning
Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh
github-actions / Unit Test Results
All 11 runs failed: test_WorkerTable (distributed.dashboard.tests.test_scheduler_bokeh)
artifacts/macos-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.9-default-notci1/pytest.xml [took 0s]
Raw output
bokeh.util.warnings.BokehDeprecationWarning: 'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:49604', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:49605', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:49607', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True)
async def test_WorkerTable(c, s, a, b):
> wt = WorkerTable(s)
distributed\dashboard\tests\test_scheduler_bokeh.py:549:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\dashboard\components\scheduler.py:4201: in __init__
mem_plot.circle(
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\plotting\glyph_api.py:144: in circle
deprecated((3, 4, 0), "circle() method with size value", "scatter(size=...) instead")
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\deprecation.py:73: in deprecated
warn(message, BokehDeprecationWarning)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
message = "'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead."
category = <class 'bokeh.util.warnings.BokehDeprecationWarning'>, stacklevel = 4
def warn(message: str, category: type[Warning] | None = None, stacklevel: int | None = None) -> None:
if stacklevel is None:
stacklevel = find_stack_level()
> warnings.warn(message, category, stacklevel=stacklevel)
E bokeh.util.warnings.BokehDeprecationWarning: 'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\warnings.py:64: BokehDeprecationWarning
Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh
github-actions / Unit Test Results
All 11 runs failed: test_WorkerTable_custom_metrics (distributed.dashboard.tests.test_scheduler_bokeh)
artifacts/macos-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.9-default-notci1/pytest.xml [took 0s]
Raw output
bokeh.util.warnings.BokehDeprecationWarning: 'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:49616', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:49617', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:49619', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True)
async def test_WorkerTable_custom_metrics(c, s, a, b):
def metric_port(worker):
return worker.port
def metric_address(worker):
return worker.address
metrics = {"metric_port": metric_port, "metric_address": metric_address}
for w in [a, b]:
for name, func in metrics.items():
w.metrics[name] = func
await asyncio.gather(a.heartbeat(), b.heartbeat())
for w in [a, b]:
assert s.workers[w.address].metrics["metric_port"] == w.port
assert s.workers[w.address].metrics["metric_address"] == w.address
> wt = WorkerTable(s)
distributed\dashboard\tests\test_scheduler_bokeh.py:586:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\dashboard\components\scheduler.py:4201: in __init__
mem_plot.circle(
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\plotting\glyph_api.py:144: in circle
deprecated((3, 4, 0), "circle() method with size value", "scatter(size=...) instead")
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\deprecation.py:73: in deprecated
warn(message, BokehDeprecationWarning)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
message = "'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead."
category = <class 'bokeh.util.warnings.BokehDeprecationWarning'>, stacklevel = 4
def warn(message: str, category: type[Warning] | None = None, stacklevel: int | None = None) -> None:
if stacklevel is None:
stacklevel = find_stack_level()
> warnings.warn(message, category, stacklevel=stacklevel)
E bokeh.util.warnings.BokehDeprecationWarning: 'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\warnings.py:64: BokehDeprecationWarning
Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh
github-actions / Unit Test Results
All 11 runs failed: test_WorkerTable_different_metrics (distributed.dashboard.tests.test_scheduler_bokeh)
artifacts/macos-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.9-default-notci1/pytest.xml [took 0s]
Raw output
bokeh.util.warnings.BokehDeprecationWarning: 'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:49630', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:49631', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:49633', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True)
async def test_WorkerTable_different_metrics(c, s, a, b):
def metric_port(worker):
return worker.port
a.metrics["metric_a"] = metric_port
b.metrics["metric_b"] = metric_port
await asyncio.gather(a.heartbeat(), b.heartbeat())
assert s.workers[a.address].metrics["metric_a"] == a.port
assert s.workers[b.address].metrics["metric_b"] == b.port
> wt = WorkerTable(s)
distributed\dashboard\tests\test_scheduler_bokeh.py:612:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\dashboard\components\scheduler.py:4201: in __init__
mem_plot.circle(
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\plotting\glyph_api.py:144: in circle
deprecated((3, 4, 0), "circle() method with size value", "scatter(size=...) instead")
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\deprecation.py:73: in deprecated
warn(message, BokehDeprecationWarning)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
message = "'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead."
category = <class 'bokeh.util.warnings.BokehDeprecationWarning'>, stacklevel = 4
def warn(message: str, category: type[Warning] | None = None, stacklevel: int | None = None) -> None:
if stacklevel is None:
stacklevel = find_stack_level()
> warnings.warn(message, category, stacklevel=stacklevel)
E bokeh.util.warnings.BokehDeprecationWarning: 'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\warnings.py:64: BokehDeprecationWarning
Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh
github-actions / Unit Test Results
All 11 runs failed: test_WorkerTable_metrics_with_different_metric_2 (distributed.dashboard.tests.test_scheduler_bokeh)
artifacts/macos-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.9-default-notci1/pytest.xml [took 0s]
Raw output
bokeh.util.warnings.BokehDeprecationWarning: 'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:49644', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:49645', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:49647', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True)
async def test_WorkerTable_metrics_with_different_metric_2(c, s, a, b):
def metric_port(worker):
return worker.port
a.metrics["metric_a"] = metric_port
await asyncio.gather(a.heartbeat(), b.heartbeat())
> wt = WorkerTable(s)
distributed\dashboard\tests\test_scheduler_bokeh.py:633:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\dashboard\components\scheduler.py:4201: in __init__
mem_plot.circle(
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\plotting\glyph_api.py:144: in circle
deprecated((3, 4, 0), "circle() method with size value", "scatter(size=...) instead")
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\deprecation.py:73: in deprecated
warn(message, BokehDeprecationWarning)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
message = "'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead."
category = <class 'bokeh.util.warnings.BokehDeprecationWarning'>, stacklevel = 4
def warn(message: str, category: type[Warning] | None = None, stacklevel: int | None = None) -> None:
if stacklevel is None:
stacklevel = find_stack_level()
> warnings.warn(message, category, stacklevel=stacklevel)
E bokeh.util.warnings.BokehDeprecationWarning: 'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\warnings.py:64: BokehDeprecationWarning
Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh
github-actions / Unit Test Results
All 11 runs failed: test_WorkerTable_add_and_remove_metrics (distributed.dashboard.tests.test_scheduler_bokeh)
artifacts/macos-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.9-default-notci1/pytest.xml [took 0s]
Raw output
bokeh.util.warnings.BokehDeprecationWarning: 'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:49658', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:49659', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:49661', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True, worker_kwargs={"metrics": {"my_port": lambda w: w.port}})
async def test_WorkerTable_add_and_remove_metrics(c, s, a, b):
def metric_port(worker):
return worker.port
a.metrics["metric_a"] = metric_port
b.metrics["metric_b"] = metric_port
await asyncio.gather(a.heartbeat(), b.heartbeat())
assert s.workers[a.address].metrics["metric_a"] == a.port
assert s.workers[b.address].metrics["metric_b"] == b.port
> wt = WorkerTable(s)
distributed\dashboard\tests\test_scheduler_bokeh.py:656:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\dashboard\components\scheduler.py:4201: in __init__
mem_plot.circle(
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\plotting\glyph_api.py:144: in circle
deprecated((3, 4, 0), "circle() method with size value", "scatter(size=...) instead")
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\deprecation.py:73: in deprecated
warn(message, BokehDeprecationWarning)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
message = "'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead."
category = <class 'bokeh.util.warnings.BokehDeprecationWarning'>, stacklevel = 4
def warn(message: str, category: type[Warning] | None = None, stacklevel: int | None = None) -> None:
if stacklevel is None:
stacklevel = find_stack_level()
> warnings.warn(message, category, stacklevel=stacklevel)
E bokeh.util.warnings.BokehDeprecationWarning: 'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\warnings.py:64: BokehDeprecationWarning
Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh
github-actions / Unit Test Results
All 11 runs failed: test_WorkerTable_with_memory_limit_as_0 (distributed.dashboard.tests.test_scheduler_bokeh)
artifacts/macos-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.9-default-notci1/pytest.xml [took 0s]
Raw output
bokeh.util.warnings.BokehDeprecationWarning: 'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:49692', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:49693', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:49695', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True, worker_kwargs={"memory_limit": 0})
async def test_WorkerTable_with_memory_limit_as_0(c, s, a, b):
> wt = WorkerTable(s)
distributed\dashboard\tests\test_scheduler_bokeh.py:692:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\dashboard\components\scheduler.py:4201: in __init__
mem_plot.circle(
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\plotting\glyph_api.py:144: in circle
deprecated((3, 4, 0), "circle() method with size value", "scatter(size=...) instead")
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\deprecation.py:73: in deprecated
warn(message, BokehDeprecationWarning)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
message = "'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead."
category = <class 'bokeh.util.warnings.BokehDeprecationWarning'>, stacklevel = 4
def warn(message: str, category: type[Warning] | None = None, stacklevel: int | None = None) -> None:
if stacklevel is None:
stacklevel = find_stack_level()
> warnings.warn(message, category, stacklevel=stacklevel)
E bokeh.util.warnings.BokehDeprecationWarning: 'circle() method with size value' was deprecated in Bokeh 3.4.0 and will be removed, use 'scatter(size=...) instead' instead.
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\warnings.py:64: BokehDeprecationWarning
Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh
github-actions / Unit Test Results
All 11 runs failed: test_TaskGraph (distributed.dashboard.tests.test_scheduler_bokeh)
artifacts/macos-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.9-default-notci1/pytest.xml [took 0s]
Raw output
bokeh.util.warnings.BokehDeprecationWarning: 'square() method' was deprecated in Bokeh 3.4.0 and will be removed, use "scatter(marker='square', ...) instead" instead.
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:49748', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:49749', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:49751', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True)
async def test_TaskGraph(c, s, a, b):
> gp = TaskGraph(s)
distributed\dashboard\tests\test_scheduler_bokeh.py:822:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\dashboard\components\scheduler.py:2305: in __init__
rect = self.root.square(
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\plotting\_decorators.py:58: in wrapped
deprecated((3, 4, 0), f"{func.__name__}() method", f"scatter(marker={func.__name__!r}, ...) instead")
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\deprecation.py:73: in deprecated
warn(message, BokehDeprecationWarning)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
message = '\'square() method\' was deprecated in Bokeh 3.4.0 and will be removed, use "scatter(marker=\'square\', ...) instead" instead.'
category = <class 'bokeh.util.warnings.BokehDeprecationWarning'>, stacklevel = 4
def warn(message: str, category: type[Warning] | None = None, stacklevel: int | None = None) -> None:
if stacklevel is None:
stacklevel = find_stack_level()
> warnings.warn(message, category, stacklevel=stacklevel)
E bokeh.util.warnings.BokehDeprecationWarning: 'square() method' was deprecated in Bokeh 3.4.0 and will be removed, use "scatter(marker='square', ...) instead" instead.
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\warnings.py:64: BokehDeprecationWarning
Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh
github-actions / Unit Test Results
All 11 runs failed: test_TaskGraph_clear (distributed.dashboard.tests.test_scheduler_bokeh)
artifacts/macos-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.9-default-notci1/pytest.xml [took 0s]
Raw output
bokeh.util.warnings.BokehDeprecationWarning: 'square() method' was deprecated in Bokeh 3.4.0 and will be removed, use "scatter(marker='square', ...) instead" instead.
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:49760', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:49761', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:49763', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True)
async def test_TaskGraph_clear(c, s, a, b):
> gp = TaskGraph(s)
distributed\dashboard\tests\test_scheduler_bokeh.py:864:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\dashboard\components\scheduler.py:2305: in __init__
rect = self.root.square(
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\plotting\_decorators.py:58: in wrapped
deprecated((3, 4, 0), f"{func.__name__}() method", f"scatter(marker={func.__name__!r}, ...) instead")
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\deprecation.py:73: in deprecated
warn(message, BokehDeprecationWarning)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
message = '\'square() method\' was deprecated in Bokeh 3.4.0 and will be removed, use "scatter(marker=\'square\', ...) instead" instead.'
category = <class 'bokeh.util.warnings.BokehDeprecationWarning'>, stacklevel = 4
def warn(message: str, category: type[Warning] | None = None, stacklevel: int | None = None) -> None:
if stacklevel is None:
stacklevel = find_stack_level()
> warnings.warn(message, category, stacklevel=stacklevel)
E bokeh.util.warnings.BokehDeprecationWarning: 'square() method' was deprecated in Bokeh 3.4.0 and will be removed, use "scatter(marker='square', ...) instead" instead.
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\warnings.py:64: BokehDeprecationWarning
Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh
github-actions / Unit Test Results
All 11 runs failed: test_TaskGraph_limit (distributed.dashboard.tests.test_scheduler_bokeh)
artifacts/macos-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.9-default-notci1/pytest.xml [took 0s]
Raw output
bokeh.util.warnings.BokehDeprecationWarning: 'square() method' was deprecated in Bokeh 3.4.0 and will be removed, use "scatter(marker='square', ...) instead" instead.
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:49782', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:49783', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:49785', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True, config={"distributed.dashboard.graph-max-items": 2})
async def test_TaskGraph_limit(c, s, a, b):
> gp = TaskGraph(s)
distributed\dashboard\tests\test_scheduler_bokeh.py:888:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\dashboard\components\scheduler.py:2305: in __init__
rect = self.root.square(
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\plotting\_decorators.py:58: in wrapped
deprecated((3, 4, 0), f"{func.__name__}() method", f"scatter(marker={func.__name__!r}, ...) instead")
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\deprecation.py:73: in deprecated
warn(message, BokehDeprecationWarning)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
message = '\'square() method\' was deprecated in Bokeh 3.4.0 and will be removed, use "scatter(marker=\'square\', ...) instead" instead.'
category = <class 'bokeh.util.warnings.BokehDeprecationWarning'>, stacklevel = 4
def warn(message: str, category: type[Warning] | None = None, stacklevel: int | None = None) -> None:
if stacklevel is None:
stacklevel = find_stack_level()
> warnings.warn(message, category, stacklevel=stacklevel)
E bokeh.util.warnings.BokehDeprecationWarning: 'square() method' was deprecated in Bokeh 3.4.0 and will be removed, use "scatter(marker='square', ...) instead" instead.
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\warnings.py:64: BokehDeprecationWarning
Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh
github-actions / Unit Test Results
All 11 runs failed: test_TaskGraph_complex (distributed.dashboard.tests.test_scheduler_bokeh)
artifacts/macos-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.9-default-notci1/pytest.xml [took 0s]
Raw output
bokeh.util.warnings.BokehDeprecationWarning: 'square() method' was deprecated in Bokeh 3.4.0 and will be removed, use "scatter(marker='square', ...) instead" instead.
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:49794', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:49795', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:49797', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True)
async def test_TaskGraph_complex(c, s, a, b):
da = pytest.importorskip("dask.array")
> gp = TaskGraph(s)
distributed\dashboard\tests\test_scheduler_bokeh.py:911:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\dashboard\components\scheduler.py:2305: in __init__
rect = self.root.square(
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\plotting\_decorators.py:58: in wrapped
deprecated((3, 4, 0), f"{func.__name__}() method", f"scatter(marker={func.__name__!r}, ...) instead")
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\deprecation.py:73: in deprecated
warn(message, BokehDeprecationWarning)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
message = '\'square() method\' was deprecated in Bokeh 3.4.0 and will be removed, use "scatter(marker=\'square\', ...) instead" instead.'
category = <class 'bokeh.util.warnings.BokehDeprecationWarning'>, stacklevel = 4
def warn(message: str, category: type[Warning] | None = None, stacklevel: int | None = None) -> None:
if stacklevel is None:
stacklevel = find_stack_level()
> warnings.warn(message, category, stacklevel=stacklevel)
E bokeh.util.warnings.BokehDeprecationWarning: 'square() method' was deprecated in Bokeh 3.4.0 and will be removed, use "scatter(marker='square', ...) instead" instead.
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\warnings.py:64: BokehDeprecationWarning
Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh
github-actions / Unit Test Results
All 11 runs failed: test_TaskGraph_order (distributed.dashboard.tests.test_scheduler_bokeh)
artifacts/macos-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.9-default-notci1/pytest.xml [took 0s]
Raw output
bokeh.util.warnings.BokehDeprecationWarning: 'square() method' was deprecated in Bokeh 3.4.0 and will be removed, use "scatter(marker='square', ...) instead" instead.
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:49806', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:49807', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:49809', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True)
async def test_TaskGraph_order(c, s, a, b):
x = c.submit(inc, 1)
y = c.submit(div, 1, 0)
await wait(y)
> gp = TaskGraph(s)
distributed\dashboard\tests\test_scheduler_bokeh.py:945:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed\dashboard\components\scheduler.py:2305: in __init__
rect = self.root.square(
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\plotting\_decorators.py:58: in wrapped
deprecated((3, 4, 0), f"{func.__name__}() method", f"scatter(marker={func.__name__!r}, ...) instead")
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\deprecation.py:73: in deprecated
warn(message, BokehDeprecationWarning)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
message = '\'square() method\' was deprecated in Bokeh 3.4.0 and will be removed, use "scatter(marker=\'square\', ...) instead" instead.'
category = <class 'bokeh.util.warnings.BokehDeprecationWarning'>, stacklevel = 4
def warn(message: str, category: type[Warning] | None = None, stacklevel: int | None = None) -> None:
if stacklevel is None:
stacklevel = find_stack_level()
> warnings.warn(message, category, stacklevel=stacklevel)
E bokeh.util.warnings.BokehDeprecationWarning: 'square() method' was deprecated in Bokeh 3.4.0 and will be removed, use "scatter(marker='square', ...) instead" instead.
C:\Users\runneradmin\miniconda3\envs\dask-distributed\lib\site-packages\bokeh\util\warnings.py:64: BokehDeprecationWarning
Check warning on line 0 in distributed.dashboard.tests.test_scheduler_bokeh
github-actions / Unit Test Results
All 11 runs failed: test_https_support (distributed.dashboard.tests.test_scheduler_bokeh)
artifacts/macos-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-default-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_expr-notci1/pytest.xml [took 0s]
artifacts/ubuntu-latest-3.9-no_queue-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.10-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.11-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.12-default-notci1/pytest.xml [took 0s]
artifacts/windows-latest-3.9-default-notci1/pytest.xml [took 0s]
Raw output
tornado.httpclient.HTTPClientError: HTTP 500: Internal Server Error
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:49914', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:49915', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:49917', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(
client=True,
scheduler_kwargs={"dashboard": True},
config={
"distributed.scheduler.dashboard.tls.key": get_cert("tls-key.pem"),
"distributed.scheduler.dashboard.tls.cert": get_cert("tls-cert.pem"),
"distributed.scheduler.dashboard.tls.ca-file": get_cert("tls-ca-cert.pem"),
},
)
async def test_https_support(c, s, a, b):
port = s.http_server.port
assert (
format_dashboard_link("localhost", port) == "https://localhost:%d/status" % port
)
ctx = ssl.create_default_context()
ctx.load_verify_locations(get_cert("tls-ca-cert.pem"))
http_client = AsyncHTTPClient()
response = await http_client.fetch(
"https://localhost:%d/individual-plots.json" % port, ssl_options=ctx
)
response = json.loads(response.body.decode())
for suffix in [
"system",
"counters",
"workers",
"status",
"tasks",
"stealing",
"graph",
] + [url.strip("/") for url in response.values()]:
req = HTTPRequest(
url="https://localhost:%d/%s" % (port, suffix), ssl_options=ctx
)
> response = await http_client.fetch(req)
E tornado.httpclient.HTTPClientError: HTTP 500: Internal Server Error
distributed\dashboard\tests\test_scheduler_bokeh.py:1165: HTTPClientError
Check warning on line 0 in distributed.shuffle.tests.test_shuffle
github-actions / Unit Test Results
1 out of 12 runs failed: test_barrier_handles_stale_resumed_transfer (distributed.shuffle.tests.test_shuffle)
artifacts/ubuntu-latest-3.9-no_queue-notci1/pytest.xml [took 31s]
Raw output
asyncio.exceptions.TimeoutError: Test timeout (30) hit after 30.00036597251892s.
========== Test stack trace starts here ==========
Stack for <Task pending name='Task-194568' coro=<test_barrier_handles_stale_resumed_transfer() running at /home/runner/work/distributed/distributed/distributed/shuffle/tests/test_shuffle.py:2707> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x75142fbe6f70>()]>> (most recent call last):
File "/home/runner/work/distributed/distributed/distributed/shuffle/tests/test_shuffle.py", line 2707, in test_barrier_handles_stale_resumed_transfer
await wait_for_state(key, "processing", s)
args = (), kwds = {}
@wraps(func)
def inner(*args, **kwds):
with self._recreate_cm():
> return func(*args, **kwds)
../../../miniconda3/envs/dask-distributed/lib/python3.9/contextlib.py:79:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
../../../miniconda3/envs/dask-distributed/lib/python3.9/contextlib.py:79: in inner
return func(*args, **kwds)
distributed/utils_test.py:1102: in test_func
return _run_and_close_tornado(async_fn_outer)
distributed/utils_test.py:378: in _run_and_close_tornado
return asyncio_run(inner_fn(), loop_factory=get_loop_factory())
distributed/compatibility.py:236: in asyncio_run
return loop.run_until_complete(main)
../../../miniconda3/envs/dask-distributed/lib/python3.9/asyncio/base_events.py:647: in run_until_complete
return future.result()
distributed/utils_test.py:375: in inner_fn
return await async_fn(*args, **kwargs)
distributed/utils_test.py:1099: in async_fn_outer
return await utils_wait_for(async_fn(), timeout=timeout * 2)
distributed/utils.py:1940: in wait_for
return await asyncio.wait_for(fut, timeout)
../../../miniconda3/envs/dask-distributed/lib/python3.9/asyncio/tasks.py:479: in wait_for
return fut.result()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
async def async_fn():
result = None
with dask.config.set(config):
async with (
_cluster_factory() as (s, workers),
_client_factory(s) as c,
):
args = [s] + workers
if c is not None:
args = [c] + args
try:
coro = func(*args, *outer_args, **kwargs)
task = asyncio.create_task(coro)
coro2 = utils_wait_for(
asyncio.shield(task), timeout=deadline.remaining
)
result = await coro2
validate_state(s, *workers)
except asyncio.TimeoutError:
assert task
elapsed = deadline.elapsed
buffer = io.StringIO()
# This stack indicates where the coro/test is suspended
task.print_stack(file=buffer)
if cluster_dump_directory:
await dump_cluster_state(
s=s,
ws=workers,
output_dir=cluster_dump_directory,
func_name=func.__name__,
)
task.cancel()
while not task.cancelled():
await asyncio.sleep(0.01)
# Hopefully, the hang has been caused by inconsistent
# state, which should be much more meaningful than the
# timeout
validate_state(s, *workers)
# Remove as much of the traceback as possible; it's
# uninteresting boilerplate from utils_test and asyncio
# and not from the code being tested.
> raise asyncio.TimeoutError(
f"Test timeout ({timeout}) hit after {elapsed}s.\n"
"========== Test stack trace starts here ==========\n"
f"{buffer.getvalue()}"
) from None
E asyncio.exceptions.TimeoutError: Test timeout (30) hit after 30.00036597251892s.
E ========== Test stack trace starts here ==========
E Stack for <Task pending name='Task-194568' coro=<test_barrier_handles_stale_resumed_transfer() running at /home/runner/work/distributed/distributed/distributed/shuffle/tests/test_shuffle.py:2707> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x75142fbe6f70>()]>> (most recent call last):
E File "/home/runner/work/distributed/distributed/distributed/shuffle/tests/test_shuffle.py", line 2707, in test_barrier_handles_stale_resumed_transfer
E await wait_for_state(key, "processing", s)
distributed/utils_test.py:1041: TimeoutError
Check notice on line 0 in .github
github-actions / Unit Test Results
109 skipped tests found
There are 109 skipped tests, see "Raw output" for the full list of skipped tests.
Raw output
distributed.cli.tests.test_dask_scheduler
distributed.cli.tests.test_dask_ssh
distributed.cli.tests.test_dask_worker ‑ test_listen_address_ipv6[tcp://:---nanny]
distributed.cli.tests.test_dask_worker ‑ test_listen_address_ipv6[tcp://:---no-nanny]
distributed.cli.tests.test_dask_worker ‑ test_listen_address_ipv6[tcp://[::1]:---nanny]
distributed.cli.tests.test_dask_worker ‑ test_listen_address_ipv6[tcp://[::1]:---no-nanny]
distributed.comm.tests.test_comms ‑ test_default_client_server_ipv6[tornado]
distributed.comm.tests.test_comms ‑ test_tcp_client_server_ipv6[tornado]
distributed.comm.tests.test_comms ‑ test_tls_client_server_ipv6[tornado]
distributed.comm.tests.test_comms ‑ test_ucx_client_server
distributed.comm.tests.test_ucx
distributed.comm.tests.test_ucx_config
distributed.dashboard.tests.test_components
distributed.dashboard.tests.test_scheduler_bokeh
distributed.dashboard.tests.test_worker_bokeh
distributed.deploy.tests.test_adaptive ‑ test_adaptive_scale_down_override
distributed.deploy.tests.test_old_ssh
distributed.deploy.tests.test_ssh
distributed.diagnostics.tests.test_cudf_diagnostics
distributed.diagnostics.tests.test_memory_sampler ‑ test_pandas[False]
distributed.diagnostics.tests.test_memory_sampler ‑ test_pandas[True]
distributed.diagnostics.tests.test_memray
distributed.diagnostics.tests.test_nvml
distributed.diagnostics.tests.test_nvml ‑ test_1_visible_devices
distributed.diagnostics.tests.test_nvml ‑ test_2_visible_devices[0,1]
distributed.diagnostics.tests.test_nvml ‑ test_2_visible_devices[1,0]
distributed.diagnostics.tests.test_nvml ‑ test_gpu_metrics
distributed.diagnostics.tests.test_nvml ‑ test_gpu_monitoring_range_query
distributed.diagnostics.tests.test_nvml ‑ test_gpu_monitoring_recent
distributed.diagnostics.tests.test_nvml ‑ test_has_cuda_context
distributed.diagnostics.tests.test_nvml ‑ test_one_time
distributed.diagnostics.tests.test_progress_stream
distributed.diagnostics.tests.test_progress_widgets
distributed.diagnostics.tests.test_rmm_diagnostics
distributed.protocol.tests.test_arrow
distributed.protocol.tests.test_collection
distributed.protocol.tests.test_collection_cuda ‑ test_serialize_cupy[50-cuda-dict]
distributed.protocol.tests.test_collection_cuda ‑ test_serialize_cupy[50-cuda-tuple]
distributed.protocol.tests.test_collection_cuda ‑ test_serialize_cupy[None-pickle-dict]
distributed.protocol.tests.test_collection_cuda ‑ test_serialize_cupy[None-pickle-tuple]
distributed.protocol.tests.test_collection_cuda ‑ test_serialize_pandas_pandas[None-pickle-dict]
distributed.protocol.tests.test_collection_cuda ‑ test_serialize_pandas_pandas[None-pickle-tuple]
distributed.protocol.tests.test_collection_cuda ‑ test_serialize_pandas_pandas[df20-cuda-dict]
distributed.protocol.tests.test_collection_cuda ‑ test_serialize_pandas_pandas[df20-cuda-tuple]
distributed.protocol.tests.test_cupy
distributed.protocol.tests.test_h5py
distributed.protocol.tests.test_highlevelgraph
distributed.protocol.tests.test_keras
distributed.protocol.tests.test_netcdf4
distributed.protocol.tests.test_numba
distributed.protocol.tests.test_numpy
distributed.protocol.tests.test_numpy ‑ test_dumps_serialize_numpy_custom_dtype
distributed.protocol.tests.test_pandas
distributed.protocol.tests.test_rmm
distributed.protocol.tests.test_scipy
distributed.protocol.tests.test_serialize ‑ test_check_dask_serializable[data7-True]
distributed.protocol.tests.test_sparse
distributed.protocol.tests.test_torch
distributed.shuffle.tests.test_graph
distributed.shuffle.tests.test_graph ‑ test_raise_on_custom_objects
distributed.shuffle.tests.test_merge
distributed.shuffle.tests.test_merge_column_and_index
distributed.shuffle.tests.test_metrics
distributed.shuffle.tests.test_rechunk
distributed.shuffle.tests.test_shuffle
distributed.shuffle.tests.test_shuffle ‑ test_basic_cudf_support
distributed.shuffle.tests.test_shuffle_plugins
distributed.tests.test_actor ‑ test_linear_access
distributed.tests.test_client ‑ test_annotations_survive_optimization
distributed.tests.test_client ‑ test_badly_serialized_input_stderr
distributed.tests.test_client ‑ test_balance_tasks_by_stacks
distributed.tests.test_client ‑ test_client_repr_closed_sync
distributed.tests.test_client ‑ test_contiguous_load
distributed.tests.test_client ‑ test_dont_delete_recomputed_results
distributed.tests.test_client ‑ test_dont_hold_on_to_large_messages
distributed.tests.test_client ‑ test_interleave_computations_map
distributed.tests.test_client ‑ test_multiple_clients
distributed.tests.test_config ‑ test_uvloop_event_loop
distributed.tests.test_counter ‑ test_digest[None-<lambda>]
distributed.tests.test_dask_collections
distributed.tests.test_dask_collections ‑ test_sparse_arrays
distributed.tests.test_jupyter
distributed.tests.test_nanny ‑ test_nanny_closed_by_keyboard_interrupt[tcp]
distributed.tests.test_nanny ‑ test_nanny_closed_by_keyboard_interrupt[ucx]
distributed.tests.test_preload ‑ test_client_preload_click
distributed.tests.test_preload ‑ test_client_preload_text
distributed.tests.test_profile ‑ test_basic_low_level
distributed.tests.test_resources ‑ test_balance_resources
distributed.tests.test_resources ‑ test_collections_get[True]
distributed.tests.test_resources ‑ test_dont_optimize_out
distributed.tests.test_resources ‑ test_full_collections
distributed.tests.test_scheduler ‑ test_rebalance_raises_missing_data3[True]
distributed.tests.test_steal ‑ test_correct_bad_time_estimate
distributed.tests.test_steal ‑ test_steal_related_tasks
distributed.tests.test_stress ‑ test_no_delay_during_large_transfer
distributed.tests.test_stress ‑ test_stress_steal
distributed.tests.test_utils_perf ‑ test_gc_diagnosis_rss_win
distributed.tests.test_utils_test ‑ test_gen_cluster_cleans_up_client
distributed.tests.test_utils_test ‑ test_gen_test
distributed.tests.test_utils_test ‑ test_gen_test_legacy_explicit
distributed.tests.test_utils_test ‑ test_gen_test_legacy_implicit
distributed.tests.test_worker ‑ test_dont_overlap_communications_to_same_worker
distributed.tests.test_worker ‑ test_get_client_coroutine_sync
distributed.tests.test_worker ‑ test_protocol_from_scheduler_address[Nanny]
distributed.tests.test_worker ‑ test_protocol_from_scheduler_address[Worker]
distributed.tests.test_worker ‑ test_share_communication
distributed.tests.test_worker ‑ test_upload_file_pyc
distributed.tests.test_worker ‑ test_upload_large_file
distributed.tests.test_worker ‑ test_wait_for_outgoing