Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add size method to TSDataset class #238

Merged
merged 6 commits into from
Feb 12, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
15 changes: 15 additions & 0 deletions etna/datasets/tsdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1651,3 +1651,18 @@
ts_samples = [samples for df_segment in ts_segments for samples in make_samples(df_segment)]

return _TorchDataset(ts_samples=ts_samples)

def size(self) -> Tuple[int, int, Optional[int]]:
"""Return size of TSDataset.
Returns
-------
:
Number of timestamps, number of segments, and number of features (None if the number of features differs between segments)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should put this information before "Returns", something like this:

def size(self) -> Tuple[int, int, Optional[int]]:
        """Return size of TSDataset.

        The order of sizes is (number of time series, number of segments, 
        and number of features (if their amounts are equal in each segment; otherwise, returns None)).
        
        Returns
        -------
        :
            Tuple of TSDataset sizes 
        """

"""
allfeatures = 0
for segment in self.segments:
cur_seg_features = self.df[segment].columns.get_level_values("feature").unique()
if allfeatures != 0 and allfeatures != len(cur_seg_features):
return len(self.index), len(self.segments), None
allfeatures = len(cur_seg_features)
return len(self.index), len(self.segments), allfeatures

Check warning on line 1668 in etna/datasets/tsdataset.py

View check run for this annotation

Codecov / codecov/patch

etna/datasets/tsdataset.py#L1662-L1668

Added lines #L1662 - L1668 were not covered by tests
37 changes: 37 additions & 0 deletions tests/test_datasets/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,43 @@ def test_dataset_segment_conversion_during_init(df_segments_int):
assert np.all(ts.columns.get_level_values("segment") == ["1", "2"])


def test_size_with_diff_number_of_features():
    """Check ``TSDataset.size`` when the feature count is reported as ``None``.

    The main frame holds the "Moscow" segment with an extra "Feature" column,
    while the exogenous frame holds the "Omsk" segment with only "target".
    The resulting dataset is expected to report the number of timestamps,
    two segments, and ``None`` as the number of features.
    """
    moscow, omsk = (
        pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-02-01", "2021-07-01", freq="1d")})
        for _ in range(2)
    )
    n_rows = len(moscow)

    moscow["segment"] = "Moscow"
    moscow["Feature"] = "Feature"
    moscow["target"] = [step**2 + np.random.uniform(-2, 2) for step in range(n_rows)]

    omsk["segment"] = "Omsk"
    omsk["target"] = [step**2 + np.random.uniform(-2, 2) for step in range(n_rows)]

    dataset = TSDataset(
        df=TSDataset.to_dataset(moscow),
        df_exog=TSDataset.to_dataset(omsk),
        freq="1d",
    )

    sizes = dataset.size()
    assert sizes[0] == n_rows
    assert sizes[1] == 2
    assert sizes[2] is None


def test_size_target_only():
    """Check ``TSDataset.size`` when every frame carries only a "target" column.

    The main frame holds the "Moscow" segment and the exogenous frame holds
    the "Omsk" segment; both contain just "target". The dataset is expected
    to report the number of timestamps, two segments, and one feature.
    """
    moscow, omsk = (
        pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-02-01", "2021-07-01", freq="1d")})
        for _ in range(2)
    )
    n_rows = len(moscow)

    moscow["segment"] = "Moscow"
    moscow["target"] = [step**2 + np.random.uniform(-2, 2) for step in range(n_rows)]

    omsk["segment"] = "Omsk"
    omsk["target"] = [step**2 + np.random.uniform(-2, 2) for step in range(n_rows)]

    dataset = TSDataset(
        df=TSDataset.to_dataset(moscow),
        df_exog=TSDataset.to_dataset(omsk),
        freq="1d",
    )

    sizes = dataset.size()
    assert sizes[0] == n_rows
    assert sizes[1] == 2
    assert sizes[2] == 1


def test_simple_size(tsdf_with_exog):
    """Check ``TSDataset.size`` on the ``tsdf_with_exog`` fixture.

    The function name starts with ``test_`` so that pytest's default
    discovery rules collect it.

    Parameters
    ----------
    tsdf_with_exog:
        Dataset fixture; expected to report 151 timestamps, 2 segments and
        2 features per segment.
    """
    n_timestamps, n_segments, n_features = tsdf_with_exog.size()
    assert n_timestamps == 151
    assert n_segments == 2
    assert n_features == 2


@pytest.mark.xfail
def test_make_future_raise_error_on_diff_endings(ts_diff_endings):
with pytest.raises(ValueError, match="All segments should end at the same timestamp"):
Expand Down