/
pandas_extensions.py
1725 lines (1486 loc) · 63.7 KB
/
pandas_extensions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import abc
import csv
import dataclasses
from collections.abc import Hashable
from datetime import datetime
from io import BufferedReader, BytesIO, StringIO
from pathlib import Path
from typing import Any, Callable, Collection, Dict, Iterator, List, Optional, Tuple, Type, Union
try:
import pandas as pd
except ImportError:
raise NotImplementedError("Pandas is not installed.")
from typing import Literal
try:
from collections.abc import Iterable, Mapping, Sequence
except ImportError:
from collections import Iterable, Mapping, Sequence
try:
import fsspec
import pyarrow.fs
FILESYSTEM_TYPE = Optional[Union[pyarrow.fs.FileSystem, fsspec.spec.AbstractFileSystem]]
except ImportError:
FILESYSTEM_TYPE = Optional[Type]
from sqlite3 import Connection
from pandas._typing import NpDtype
from pandas.core.dtypes.dtypes import ExtensionDtype
from hamilton import registry
from hamilton.io import utils
from hamilton.io.data_adapters import DataLoader, DataSaver
# Canonical pandas container types registered with Hamilton's type registry below.
DATAFRAME_TYPE = pd.DataFrame
COLUMN_TYPE = pd.Series
# Values that can be serialized to JSON (used by the JSON writer's default_handler).
JSONSerializable = Optional[Union[str, float, bool, List, Dict]]
# Label(s) accepted by pandas index-related kwargs such as ``index_label``.
IndexLabel = Optional[Union[Hashable, Iterator[Hashable]]]
# Anything pandas accepts as a column dtype: a numpy dtype or a pandas extension dtype.
Dtype = Union[ExtensionDtype, NpDtype]
@registry.get_column.register(pd.DataFrame)
def get_column_pandas(df: pd.DataFrame, column_name: str) -> pd.Series:
    """Return the column named ``column_name`` from ``df`` as a Series."""
    column = df[column_name]
    return column
@registry.fill_with_scalar.register(pd.DataFrame)
def fill_with_scalar_pandas(df: pd.DataFrame, column_name: str, value: Any) -> pd.DataFrame:
    """Set every row of ``column_name`` to ``value`` (mutates ``df``) and return it."""
    # Plain column assignment: pandas broadcasts a scalar across all rows.
    df[column_name] = value
    return df
def register_types():
    """Register pandas DataFrame/Series with Hamilton's registry under the "pandas" key."""
    registry.register_types("pandas", DATAFRAME_TYPE, COLUMN_TYPE)


# Run at import time so the pandas types are available as soon as this module loads.
register_types()
class DataFrameDataLoader(DataLoader, DataSaver, abc.ABC):
    """Base class for data loaders that saves/loads pandas dataframes.

    Note that these are currently grouped together, but this could change!
    We can change this as these are not part of the publicly exposed APIs.
    Rather, the fixed component is the keys (E.G. csv, feather, etc...) , which,
    when combined with types, correspond to a group of specific parameter. As such,
    the backwards-compatible invariance enables us to change the implementation
    (which classes), and so long as the set of parameters/load targets are compatible,
    we are good to go."""

    @classmethod
    def applicable_types(cls) -> Collection[Type]:
        """Subclasses load/save pandas DataFrames only."""
        return [DATAFRAME_TYPE]

    @abc.abstractmethod
    def load_data(self, type_: Type[DATAFRAME_TYPE]) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]:
        """Load a dataframe; returns the dataframe plus a metadata dict."""
        pass

    @abc.abstractmethod
    def save_data(self, data: DATAFRAME_TYPE) -> Dict[str, Any]:
        """Persist the dataframe; returns a metadata dict describing what was written."""
        pass
@dataclasses.dataclass
class PandasCSVReader(DataLoader):
    """Loads CSV files into pandas DataFrames.

    Maps to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
    Every constructor field mirrors the corresponding ``pd.read_csv`` keyword;
    leaving a field as ``None`` defers to the pandas default.
    """

    # the filepath_or_buffer param was renamed to path for symmetry with the writer.
    path: Union[str, Path, BytesIO, BufferedReader]
    # kwargs
    sep: Union[str, None] = ","
    delimiter: Optional[str] = None
    header: Union[Sequence, int, Literal["infer"], None] = "infer"
    names: Optional[Sequence] = None
    index_col: Optional[Union[Hashable, Sequence, Literal[False]]] = None
    usecols: Optional[Union[List[Hashable], Callable, tuple]] = None
    dtype: Optional[Union[Dtype, Dict[Hashable, Dtype]]] = None
    engine: Optional[Literal["c", "python", "pyarrow", "python-fwf"]] = None
    converters: Optional[Mapping] = None
    true_values: Optional[List] = None
    false_values: Optional[List] = None
    skipinitialspace: Optional[bool] = False
    skiprows: Optional[Union[List[int], int, Callable[[Hashable], bool]]] = None
    skipfooter: int = 0
    nrows: Optional[int] = None
    na_values: Optional[Union[Hashable, Iterable, Mapping]] = None
    keep_default_na: bool = True
    na_filter: bool = True
    verbose: bool = False
    skip_blank_lines: bool = True
    parse_dates: Optional[Union[bool, Sequence, None]] = False
    keep_date_col: bool = False
    date_format: Optional[str] = None
    dayfirst: bool = False
    cache_dates: bool = True
    iterator: bool = False
    chunksize: Optional[int] = None
    compression: Optional[
        Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd", "tar"], Dict[str, Any]]
    ] = "infer"
    thousands: Optional[str] = None
    decimal: str = "."
    lineterminator: Optional[str] = None
    quotechar: Optional[str] = None
    quoting: int = 0
    doublequote: bool = True
    escapechar: Optional[str] = None
    comment: Optional[str] = None
    encoding: str = "utf-8"
    encoding_errors: Union[
        Literal["strict", "ignore", "replace", "backslashreplace", "surrogateescape"],
        str,
    ] = "strict"
    dialect: Optional[Union[str, csv.Dialect]] = None
    on_bad_lines: Union[Literal["error", "warn", "skip"], Callable] = "error"
    delim_whitespace: bool = False
    low_memory: bool = True
    memory_map: bool = False
    float_precision: Optional[Literal["high", "legacy", "round_trip"]] = None
    storage_options: Optional[Dict[str, Any]] = None
    dtype_backend: Literal["pyarrow", "numpy_nullable"] = "numpy_nullable"

    @classmethod
    def applicable_types(cls) -> Collection[Type]:
        """This loader produces pandas DataFrames."""
        return [DATAFRAME_TYPE]

    def _get_loading_kwargs(self) -> Dict[str, Any]:
        """Build the keyword arguments forwarded to ``pd.read_csv``.

        Keys whose value is ``None`` are dropped so pandas applies its own default.
        """
        option_pairs = (
            ("sep", self.sep),
            ("delimiter", self.delimiter),
            ("header", self.header),
            ("names", self.names),
            ("index_col", self.index_col),
            ("usecols", self.usecols),
            ("dtype", self.dtype),
            ("engine", self.engine),
            ("converters", self.converters),
            ("true_values", self.true_values),
            ("false_values", self.false_values),
            ("skipinitialspace", self.skipinitialspace),
            ("skiprows", self.skiprows),
            # BUG FIX: `skipfooter` was declared but never forwarded, so a user-set
            # value was silently ignored. The default (0) matches the pandas default.
            ("skipfooter", self.skipfooter),
            ("nrows", self.nrows),
            ("na_values", self.na_values),
            ("keep_default_na", self.keep_default_na),
            ("na_filter", self.na_filter),
            ("verbose", self.verbose),
            ("skip_blank_lines", self.skip_blank_lines),
            ("parse_dates", self.parse_dates),
            ("keep_date_col", self.keep_date_col),
            ("date_format", self.date_format),
            ("dayfirst", self.dayfirst),
            ("cache_dates", self.cache_dates),
            ("iterator", self.iterator),
            ("chunksize", self.chunksize),
            ("compression", self.compression),
            ("thousands", self.thousands),
            # BUG FIX: `decimal` was declared but never forwarded, so a user-set
            # value was silently ignored. The default (".") matches the pandas default.
            ("decimal", self.decimal),
            ("lineterminator", self.lineterminator),
            ("quotechar", self.quotechar),
            ("quoting", self.quoting),
            ("doublequote", self.doublequote),
            ("escapechar", self.escapechar),
            ("comment", self.comment),
            ("encoding", self.encoding),
            ("encoding_errors", self.encoding_errors),
            ("dialect", self.dialect),
            ("on_bad_lines", self.on_bad_lines),
            ("delim_whitespace", self.delim_whitespace),
            ("low_memory", self.low_memory),
            ("memory_map", self.memory_map),
            ("float_precision", self.float_precision),
            ("storage_options", self.storage_options),
        )
        kwargs = {name: value for name, value in option_pairs if value is not None}
        # `dtype_backend` only exists from pandas 2.0 onward.
        # NOTE(review): lexicographic version comparison, matching the convention used
        # elsewhere in this file; would misorder a hypothetical pandas "10.x".
        if pd.__version__ >= "2.0" and self.dtype_backend is not None:
            kwargs["dtype_backend"] = self.dtype_backend
        return kwargs

    def load_data(self, type_: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]:
        """Read the CSV at ``self.path``; returns (dataframe, file+dataframe metadata)."""
        df = pd.read_csv(self.path, **self._get_loading_kwargs())
        metadata = utils.get_file_and_dataframe_metadata(self.path, df)
        return df, metadata

    @classmethod
    def name(cls) -> str:
        return "csv"
@dataclasses.dataclass
class PandasCSVWriter(DataSaver):
    """Saves pandas DataFrames as CSV files.

    Maps to https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
    Fields left as ``None`` are omitted so pandas applies its own defaults.
    """

    path: Union[str, Path, BytesIO, BufferedReader]
    # kwargs
    sep: Union[str, None] = ","
    na_rep: str = ""
    float_format: Optional[Union[str, Callable]] = None
    columns: Optional[Sequence] = None
    header: Optional[Union[bool, List[str]]] = True
    index: Optional[bool] = False
    index_label: Optional[IndexLabel] = None
    mode: str = "w"
    encoding: Optional[str] = None
    compression: Optional[
        Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd", "tar"], Dict[str, Any]]
    ] = "infer"
    quoting: Optional[int] = None
    quotechar: Optional[str] = '"'
    lineterminator: Optional[str] = None
    chunksize: Optional[int] = None
    date_format: Optional[str] = None
    doublequote: bool = True
    escapechar: Optional[str] = None
    decimal: str = "."
    errors: str = "strict"
    storage_options: Optional[Dict[str, Any]] = None

    @classmethod
    def applicable_types(cls) -> Collection[Type]:
        """This saver consumes pandas DataFrames."""
        return [DATAFRAME_TYPE]

    def _get_saving_kwargs(self) -> Dict[str, Any]:
        """Collect every explicitly-set (non-None) option for ``DataFrame.to_csv``."""
        option_pairs = (
            ("sep", self.sep),
            ("na_rep", self.na_rep),
            ("float_format", self.float_format),
            ("columns", self.columns),
            ("header", self.header),
            ("index", self.index),
            ("index_label", self.index_label),
            ("mode", self.mode),
            ("encoding", self.encoding),
            ("compression", self.compression),
            ("quoting", self.quoting),
            ("quotechar", self.quotechar),
            ("lineterminator", self.lineterminator),
            ("chunksize", self.chunksize),
            ("date_format", self.date_format),
            ("doublequote", self.doublequote),
            ("escapechar", self.escapechar),
            ("decimal", self.decimal),
            ("errors", self.errors),
            ("storage_options", self.storage_options),
        )
        # None means "defer to the pandas default", so those keys are dropped.
        return {name: value for name, value in option_pairs if value is not None}

    def save_data(self, data: DATAFRAME_TYPE) -> Dict[str, Any]:
        """Write ``data`` to ``self.path`` as CSV; returns file + dataframe metadata."""
        data.to_csv(self.path, **self._get_saving_kwargs())
        return utils.get_file_and_dataframe_metadata(self.path, data)

    @classmethod
    def name(cls) -> str:
        return "csv"
@dataclasses.dataclass
class PandasParquetReader(DataLoader):
    """Loads parquet files into pandas DataFrames.

    Maps to https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html#pandas.read_parquet
    """

    path: Union[str, Path, BytesIO, BufferedReader]
    # kwargs
    engine: Literal["auto", "pyarrow", "fastparquet"] = "auto"
    columns: Optional[List[str]] = None
    storage_options: Optional[Dict[str, Any]] = None
    use_nullable_dtypes: bool = False
    dtype_backend: Literal["numpy_nullable", "pyarrow"] = "numpy_nullable"
    filesystem: Optional[str] = None
    filters: Optional[Union[List[Tuple], List[List[Tuple]]]] = None

    @classmethod
    def applicable_types(cls) -> Collection[Type]:
        """This loader produces pandas DataFrames."""
        return [DATAFRAME_TYPE]

    def _get_loading_kwargs(self):
        """Assemble the non-None options to forward to ``pd.read_parquet``."""
        simple_options = (
            ("engine", self.engine),
            ("columns", self.columns),
            ("storage_options", self.storage_options),
            ("filesystem", self.filesystem),
            ("filters", self.filters),
        )
        kwargs = {key: value for key, value in simple_options if value is not None}
        # `use_nullable_dtypes` is the pre-2.0 option; `dtype_backend` supersedes it
        # from pandas 2.0 onward, so exactly one of the two is ever forwarded.
        if pd.__version__ < "2.0" and self.use_nullable_dtypes is not None:
            kwargs["use_nullable_dtypes"] = self.use_nullable_dtypes
        if pd.__version__ >= "2.0" and self.dtype_backend is not None:
            kwargs["dtype_backend"] = self.dtype_backend
        return kwargs

    def load_data(self, type_: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]:
        """Read the parquet data at ``self.path``; returns (dataframe, metadata)."""
        frame = pd.read_parquet(self.path, **self._get_loading_kwargs())
        return frame, utils.get_file_and_dataframe_metadata(self.path, frame)

    @classmethod
    def name(cls) -> str:
        return "parquet"
@dataclasses.dataclass
class PandasParquetWriter(DataSaver):
    """Saves pandas DataFrames as parquet files.

    Maps to https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet
    """

    path: Union[str, Path, BytesIO, BufferedReader]
    # kwargs
    engine: Literal["auto", "pyarrow", "fastparquet"] = "auto"
    compression: Optional[str] = "snappy"
    index: Optional[bool] = None
    partition_cols: Optional[List[str]] = None
    storage_options: Optional[Dict[str, Any]] = None
    # Escape hatch: merged last, so it can override/extend any of the above.
    extra_kwargs: Optional[Dict[str, Any]] = None

    @classmethod
    def applicable_types(cls) -> Collection[Type]:
        """This saver consumes pandas DataFrames."""
        return [DATAFRAME_TYPE]

    def _get_saving_kwargs(self) -> Dict[str, Any]:
        """Collect non-None options for ``DataFrame.to_parquet``, then apply extras."""
        option_pairs = (
            ("engine", self.engine),
            ("compression", self.compression),
            ("index", self.index),
            ("partition_cols", self.partition_cols),
            ("storage_options", self.storage_options),
        )
        kwargs = {key: value for key, value in option_pairs if value is not None}
        if self.extra_kwargs is not None:
            kwargs.update(self.extra_kwargs)
        return kwargs

    def save_data(self, data: DATAFRAME_TYPE) -> Dict[str, Any]:
        """Write ``data`` to ``self.path`` as parquet; returns file + dataframe metadata."""
        data.to_parquet(self.path, **self._get_saving_kwargs())
        return utils.get_file_and_dataframe_metadata(self.path, data)

    @classmethod
    def name(cls) -> str:
        return "parquet"
@dataclasses.dataclass
class PandasPickleReader(DataLoader):
    """Loads pickled pandas DataFrames.

    Maps to https://pandas.pydata.org/docs/reference/api/pandas.read_pickle.html#pandas.read_pickle
    """

    filepath_or_buffer: Union[str, Path, BytesIO, BufferedReader] = None
    # `path` is an alias for `filepath_or_buffer`, kept so reader/writer args stay symmetric.
    path: Union[str, Path, BytesIO, BufferedReader] = None
    # kwargs:
    compression: Union[str, Dict[str, Any], None] = "infer"
    storage_options: Optional[Dict[str, Any]] = None

    def __post_init__(self):
        """Reconcile the `path` alias with `filepath_or_buffer`.

        Exactly one of the two must be provided; when `path` is used it is copied
        into `filepath_or_buffer`, which the rest of the class reads.
        """
        if self.filepath_or_buffer is None and self.path is None:
            raise ValueError("Either filepath_or_buffer or path must be specified")
        if self.filepath_or_buffer is not None and self.path is not None:
            raise ValueError("Only one of filepath_or_buffer or path must be specified")
        if self.filepath_or_buffer is None:
            self.filepath_or_buffer = self.path

    @classmethod
    def applicable_types(cls) -> Collection[Type]:
        """This loader produces pandas DataFrames."""
        return [DATAFRAME_TYPE]

    def _get_loading_kwargs(self) -> Dict[str, Any]:
        """Gather the explicitly-set (non-None) options for ``pd.read_pickle``."""
        option_pairs = (
            ("compression", self.compression),
            ("storage_options", self.storage_options),
        )
        return {key: value for key, value in option_pairs if value is not None}

    def load_data(self, type_: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]:
        """Unpickle the dataframe; returns (dataframe, file + dataframe metadata)."""
        frame = pd.read_pickle(self.filepath_or_buffer, **self._get_loading_kwargs())
        return frame, utils.get_file_and_dataframe_metadata(self.filepath_or_buffer, frame)

    @classmethod
    def name(cls) -> str:
        return "pickle"
# Default pickle protocol for PandasPickleWriter; protocol 5 requires Python 3.8+.
pickle_protocol_default = 5
@dataclasses.dataclass
class PandasPickleWriter(DataSaver):
    """Pickles pandas DataFrames to a file or buffer.

    Maps to https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html#pandas.DataFrame.to_pickle
    """

    path: Union[str, Path, BytesIO, BufferedReader]
    # kwargs:
    compression: Union[str, Dict[str, Any], None] = "infer"
    protocol: int = pickle_protocol_default
    storage_options: Optional[Dict[str, Any]] = None

    @classmethod
    def applicable_types(cls) -> Collection[Type]:
        """This saver consumes pandas DataFrames."""
        return [DATAFRAME_TYPE]

    def _get_saving_kwargs(self) -> Dict[str, Any]:
        """Collect the non-None options for ``DataFrame.to_pickle``."""
        option_pairs = (
            ("compression", self.compression),
            ("protocol", self.protocol),
            ("storage_options", self.storage_options),
        )
        return {key: value for key, value in option_pairs if value is not None}

    def save_data(self, data: DATAFRAME_TYPE) -> Dict[str, Any]:
        """Pickle ``data`` to ``self.path``; returns file + dataframe metadata."""
        data.to_pickle(self.path, **self._get_saving_kwargs())
        return utils.get_file_and_dataframe_metadata(self.path, data)

    @classmethod
    def name(cls) -> str:
        return "pickle"
@dataclasses.dataclass
class PandasJsonReader(DataLoader):
    """Loads JSON files/buffers into pandas DataFrames.

    Disclaimer: We're exposing all the *current* params from the Pandas read_json method.
    Some of these params may get deprecated or new params may be introduced. In the event that
    the params/kwargs below become outdated, please raise an issue or submit a pull request.
    Should map to https://pandas.pydata.org/docs/reference/api/pandas.read_json.html
    """

    filepath_or_buffer: Union[str, Path, BytesIO, BufferedReader]
    # kwargs
    chunksize: Optional[int] = None
    compression: Optional[Union[str, Dict[str, Any]]] = "infer"
    convert_axes: Optional[bool] = None
    convert_dates: Union[bool, List[str]] = True
    date_unit: Optional[str] = None
    dtype: Optional[Union[Dtype, Dict[Hashable, Dtype]]] = None
    dtype_backend: Optional[str] = None
    encoding: Optional[str] = None
    encoding_errors: Optional[str] = "strict"
    engine: str = "ujson"
    keep_default_dates: bool = True
    lines: bool = False
    nrows: Optional[int] = None
    orient: Optional[str] = None
    precise_float: bool = False
    storage_options: Optional[Dict[str, Any]] = None
    typ: str = "frame"

    @classmethod
    def applicable_types(cls) -> Collection[Type]:
        """This loader produces pandas DataFrames."""
        return [DATAFRAME_TYPE]

    def _get_loading_kwargs(self) -> Dict[str, Any]:
        """Build the non-None keyword arguments for ``pd.read_json``."""
        option_pairs = (
            ("chunksize", self.chunksize),
            ("compression", self.compression),
            ("convert_axes", self.convert_axes),
            ("convert_dates", self.convert_dates),
            ("date_unit", self.date_unit),
            ("dtype", self.dtype),
            ("encoding", self.encoding),
            ("encoding_errors", self.encoding_errors),
            ("engine", self.engine),
            ("keep_default_dates", self.keep_default_dates),
            ("lines", self.lines),
            ("nrows", self.nrows),
            ("orient", self.orient),
            ("precise_float", self.precise_float),
            ("storage_options", self.storage_options),
            ("typ", self.typ),
        )
        kwargs = {key: value for key, value in option_pairs if value is not None}
        # `dtype_backend` only exists from pandas 2.0 onward.
        if pd.__version__ >= "2.0" and self.dtype_backend is not None:
            kwargs["dtype_backend"] = self.dtype_backend
        return kwargs

    def load_data(self, type_: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]:
        """Read JSON from ``self.filepath_or_buffer``; returns (dataframe, metadata)."""
        frame = pd.read_json(self.filepath_or_buffer, **self._get_loading_kwargs())
        metadata = utils.get_file_and_dataframe_metadata(self.filepath_or_buffer, frame)
        return frame, metadata

    @classmethod
    def name(cls) -> str:
        return "json"
@dataclasses.dataclass
class PandasJsonWriter(DataSaver):
    """Saves pandas DataFrames to JSON files/buffers.

    Disclaimer: We're exposing all the *current* params from the Pandas DataFrame.to_json method.
    Some of these params may get deprecated or new params may be introduced. In the event that
    the params/kwargs below become outdated, please raise an issue or submit a pull request.
    Should map to https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_json.html
    """

    filepath_or_buffer: Union[str, Path, BytesIO, BufferedReader]
    # kwargs
    compression: str = "infer"
    date_format: str = "epoch"
    date_unit: str = "ms"
    default_handler: Optional[Callable[[Any], JSONSerializable]] = None
    double_precision: int = 10
    force_ascii: bool = True
    index: Optional[bool] = None
    indent: int = 0
    lines: bool = False
    mode: str = "w"
    orient: Optional[str] = None
    storage_options: Optional[Dict[str, Any]] = None

    @classmethod
    def applicable_types(cls) -> Collection[Type]:
        """This saver consumes pandas DataFrames."""
        return [DATAFRAME_TYPE]

    def _get_saving_kwargs(self):
        """Build the keyword arguments forwarded to ``DataFrame.to_json``."""
        option_pairs = (
            ("compression", self.compression),
            ("date_format", self.date_format),
            ("date_unit", self.date_unit),
            ("default_handler", self.default_handler),
            ("double_precision", self.double_precision),
            ("force_ascii", self.force_ascii),
            ("index", self.index),
            ("indent", self.indent),
            ("mode", self.mode),
            ("orient", self.orient),
            ("storage_options", self.storage_options),
        )
        kwargs = {key: value for key, value in option_pairs if value is not None}
        # `lines` is only forwarded when it differs from its default (False),
        # mirroring the original skip-when-default behavior for this flag.
        if self.lines is not False:
            kwargs["lines"] = self.lines
        return kwargs

    def save_data(self, data: DATAFRAME_TYPE) -> Dict[str, Any]:
        """Write ``data`` as JSON to ``self.filepath_or_buffer``; returns metadata."""
        data.to_json(self.filepath_or_buffer, **self._get_saving_kwargs())
        return utils.get_file_and_dataframe_metadata(self.filepath_or_buffer, data)

    @classmethod
    def name(cls) -> str:
        return "json"
@dataclasses.dataclass
class PandasSqlReader(DataLoader):
    """Loads SQL query/table results into pandas DataFrames.

    Disclaimer: We're exposing all the *current* params from the Pandas read_sql method.
    Some of these params may get deprecated or new params may be introduced. In the event that
    the params/kwargs below become outdated, please raise an issue or submit a pull request.
    Should map to https://pandas.pydata.org/docs/reference/api/pandas.read_sql.html
    Requires optional Pandas dependencies. See https://pandas.pydata.org/docs/getting_started/install.html#sql-databases.
    """

    query_or_table: str
    db_connection: Union[str, Connection]  # can pass in SQLAlchemy engine/connection
    # kwarg
    chunksize: Optional[int] = None
    coerce_float: bool = True
    columns: Optional[List[str]] = None
    dtype: Optional[Union[Dtype, Dict[Hashable, Dtype]]] = None
    dtype_backend: Optional[str] = None
    index_col: Optional[Union[str, List[str]]] = None
    params: Optional[Union[List, Tuple, Dict]] = None
    parse_dates: Optional[Union[List, Dict]] = None

    @classmethod
    def applicable_types(cls) -> Collection[Type]:
        """This loader produces pandas DataFrames."""
        return [DATAFRAME_TYPE]

    def _get_loading_kwargs(self) -> Dict[str, Any]:
        """Collect the non-None options forwarded to ``pd.read_sql``."""
        option_pairs = (
            ("chunksize", self.chunksize),
            ("coerce_float", self.coerce_float),
            ("columns", self.columns),
            ("dtype", self.dtype),
            ("index_col", self.index_col),
            ("params", self.params),
            ("parse_dates", self.parse_dates),
        )
        kwargs = {key: value for key, value in option_pairs if value is not None}
        # `dtype_backend` only exists from pandas 2.0 onward.
        if pd.__version__ >= "2.0" and self.dtype_backend is not None:
            kwargs["dtype_backend"] = self.dtype_backend
        return kwargs

    def load_data(self, type_: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]:
        """Execute the query (or read the table); returns (dataframe, sql + df metadata)."""
        frame = pd.read_sql(self.query_or_table, self.db_connection, **self._get_loading_kwargs())
        metadata = {
            **utils.get_sql_metadata(self.query_or_table, frame),
            **utils.get_dataframe_metadata(frame),
        }
        return frame, metadata

    @classmethod
    def name(cls) -> str:
        return "sql"
@dataclasses.dataclass
class PandasSqlWriter(DataSaver):
    """Saves pandas DataFrames to SQL databases.

    Disclaimer: We're exposing all the *current* params from the Pandas DataFrame.to_sql method.
    Some of these params may get deprecated or new params may be introduced. In the event that
    the params/kwargs below become outdated, please raise an issue or submit a pull request.
    Should map to https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html
    Requires optional Pandas dependencies. See https://pandas.pydata.org/docs/getting_started/install.html#sql-databases.
    """

    table_name: str
    db_connection: Any  # can pass in SQLAlchemy engine/connection
    # kwargs
    chunksize: Optional[int] = None
    dtype: Optional[Union[Dtype, Dict[Hashable, Dtype]]] = None
    if_exists: str = "fail"
    index: bool = True
    index_label: Optional[IndexLabel] = None
    method: Optional[Union[str, Callable]] = None
    schema: Optional[str] = None

    @classmethod
    def applicable_types(cls) -> Collection[Type]:
        """This saver consumes pandas DataFrames."""
        return [DATAFRAME_TYPE]

    def _get_saving_kwargs(self) -> Dict[str, Any]:
        """Collect the non-None options forwarded to ``DataFrame.to_sql``."""
        option_pairs = (
            ("chunksize", self.chunksize),
            ("dtype", self.dtype),
            ("if_exists", self.if_exists),
            ("index", self.index),
            ("index_label", self.index_label),
            ("method", self.method),
            ("schema", self.schema),
        )
        return {key: value for key, value in option_pairs if value is not None}

    def save_data(self, data: DATAFRAME_TYPE) -> Dict[str, Any]:
        """Write ``data`` to ``self.table_name``; returns sql + dataframe metadata."""
        results = data.to_sql(self.table_name, self.db_connection, **self._get_saving_kwargs())
        return {
            **utils.get_sql_metadata(self.table_name, results),
            **utils.get_dataframe_metadata(data),
        }

    @classmethod
    def name(cls) -> str:
        return "sql"
@dataclasses.dataclass
class PandasXmlReader(DataLoader):
"""Class for loading/reading xml files with Pandas.
Maps to https://pandas.pydata.org/docs/reference/api/pandas.read_xml.html
Requires `lxml`. See https://pandas.pydata.org/docs/getting_started/install.html#xml
"""
path_or_buffer: Union[str, Path, BytesIO, BufferedReader]
# kwargs
xpath: Optional[str] = "./*"
namespace: Optional[Dict[str, str]] = None
elems_only: Optional[bool] = False
attrs_only: Optional[bool] = False
names: Optional[List[str]] = None
dtype: Optional[Dict[str, Any]] = None
converters: Optional[Dict[Union[int, str], Any]] = None
parse_dates: Union[bool, List[Union[int, str, List[List], Dict[str, List[int]]]]] = False
encoding: Optional[str] = "utf-8"
parser: str = "lxml"
stylesheet: Union[str, Path, BytesIO, BufferedReader] = None
iterparse: Optional[Dict[str, List[str]]] = None
compression: Union[str, Dict[str, Any], None] = "infer"
storage_options: Optional[Dict[str, Any]] = None
dtype_backend: str = "numpy_nullable"
@classmethod
def applicable_types(cls) -> Collection[Type]:
return [DATAFRAME_TYPE]
def _get_loading_kwargs(self) -> Dict[str, Any]:
kwargs = {}
if self.xpath is not None:
kwargs["xpath"] = self.xpath
if self.namespace is not None:
kwargs["namespace"] = self.namespace
if self.elems_only is not None:
kwargs["elems_only"] = self.elems_only
if self.attrs_only is not None:
kwargs["attrs_only"] = self.attrs_only
if self.names is not None:
kwargs["names"] = self.names
if self.dtype is not None:
kwargs["dtype"] = self.dtype
if self.converters is not None:
kwargs["converters"] = self.converters
if self.parse_dates is not None:
kwargs["parse_dates"] = self.parse_dates
if self.encoding is not None:
kwargs["encoding"] = self.encoding
if self.parser is not None:
kwargs["parser"] = self.parser
if self.encoding is not None:
kwargs["encoding"] = self.encoding
if self.parser is not None:
kwargs["parser"] = self.parser
if self.stylesheet is not None:
kwargs["stylesheet"] = self.stylesheet
if self.iterparse is not None:
kwargs["iterparse"] = self.iterparse
if self.compression is not None:
kwargs["compression"] = self.compression
if self.storage_options is not None:
kwargs["storage_options"] = self.storage_options
if pd.__version__ >= "2.0" and self.dtype_backend is not None:
kwargs["dtype_backend"] = self.dtype_backend
return kwargs
def load_data(self, type: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]:
    """Read the XML source into a DataFrame.

    Returns the frame together with file/dataframe metadata gathered from
    the same path or buffer.
    """
    loading_kwargs = self._get_loading_kwargs()
    loaded_df = pd.read_xml(self.path_or_buffer, **loading_kwargs)
    return loaded_df, utils.get_file_and_dataframe_metadata(self.path_or_buffer, loaded_df)
@classmethod
def name(cls) -> str:
    """Return the format name this reader handles: ``"xml"``."""
    return "xml"
@dataclasses.dataclass
class PandasXmlWriter(DataSaver):
    """Class specifically to handle saving xml files/buffers with Pandas.
    Should map to https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_xml.html
    Requires `lxml`. See https://pandas.pydata.org/docs/getting_started/install.html#xml.

    Fields left as None are omitted from the ``to_xml`` call so pandas
    applies its own defaults.
    """

    path_or_buffer: Union[str, Path, BytesIO, BufferedReader]
    # kwargs
    index: bool = True
    root_name: str = "data"
    row_name: str = "row"
    na_rep: Optional[str] = None
    attr_cols: Optional[List[str]] = None
    # NOTE: pandas names this parameter ``elem_cols``; the field keeps the
    # original ``elems_cols`` spelling for backwards compatibility.
    elems_cols: Optional[List[str]] = None
    namespaces: Optional[Dict[str, str]] = None
    prefix: Optional[str] = None
    encoding: str = "utf-8"
    xml_declaration: bool = True
    pretty_print: bool = True
    parser: str = "lxml"
    stylesheet: Optional[Union[str, Path, BytesIO, BufferedReader]] = None
    compression: Union[str, Dict[str, Any], None] = "infer"
    storage_options: Optional[Dict[str, Any]] = None

    @classmethod
    def applicable_types(cls) -> Collection[Type]:
        """This saver accepts pandas DataFrames only."""
        return [DATAFRAME_TYPE]

    def _get_saving_kwargs(self) -> Dict[str, Any]:
        """Build the keyword-argument dict forwarded to ``DataFrame.to_xml``.

        Only attributes that are not None are included.
        """
        kwargs = {}
        if self.index is not None:
            kwargs["index"] = self.index
        if self.root_name is not None:
            kwargs["root_name"] = self.root_name
        if self.row_name is not None:
            kwargs["row_name"] = self.row_name
        if self.na_rep is not None:
            kwargs["na_rep"] = self.na_rep
        if self.attr_cols is not None:
            kwargs["attr_cols"] = self.attr_cols
        if self.elems_cols is not None:
            # Bug fix: DataFrame.to_xml has no ``elems_cols`` parameter — the
            # pandas name is ``elem_cols``. Passing the old key raised a
            # TypeError whenever this field was set.
            kwargs["elem_cols"] = self.elems_cols
        if self.namespaces is not None:
            kwargs["namespaces"] = self.namespaces
        if self.prefix is not None:
            kwargs["prefix"] = self.prefix
        if self.encoding is not None:
            kwargs["encoding"] = self.encoding
        if self.xml_declaration is not None:
            kwargs["xml_declaration"] = self.xml_declaration
        if self.pretty_print is not None:
            kwargs["pretty_print"] = self.pretty_print
        if self.parser is not None:
            kwargs["parser"] = self.parser
        if self.stylesheet is not None:
            kwargs["stylesheet"] = self.stylesheet
        if self.compression is not None:
            kwargs["compression"] = self.compression
        if self.storage_options is not None:
            kwargs["storage_options"] = self.storage_options
        return kwargs

    def save_data(self, data: DATAFRAME_TYPE) -> Dict[str, Any]:
        """Write ``data`` to ``self.path_or_buffer`` as XML and return file/dataframe metadata."""
        data.to_xml(self.path_or_buffer, **self._get_saving_kwargs())
        return utils.get_file_and_dataframe_metadata(self.path_or_buffer, data)

    @classmethod
    def name(cls) -> str:
        """Return the format name this saver handles: ``"xml"``."""
        return "xml"
@dataclasses.dataclass
class PandasHtmlReader(DataLoader):
    """Class for loading/reading HTML tables with Pandas.
    Maps to https://pandas.pydata.org/docs/reference/api/pandas.read_html.html
    """

    # Source of the HTML: a path/URL string or an open buffer.
    io: Union[str, Path, BytesIO, BufferedReader]
    # kwargs — each field mirrors the pandas.read_html parameter of the same
    # name; None-valued fields are omitted so pandas falls back to its own
    # defaults (see _get_loading_kwargs).
    match: Optional[str] = ".+"
    flavor: Optional[Union[str, Sequence]] = None
    header: Optional[Union[int, Sequence]] = None
    index_col: Optional[Union[int, Sequence]] = None
    skiprows: Optional[Union[int, Sequence, slice]] = None
    attrs: Optional[Dict[str, str]] = None
    parse_dates: Optional[bool] = None
    thousands: Optional[str] = ","  # thousands separator; defaults to ","
    encoding: Optional[str] = None
    decimal: str = "."
    converters: Optional[Dict[Any, Any]] = None
    na_values: Optional[Iterable] = None
    keep_default_na: bool = True
    displayed_only: bool = True
    extract_links: Optional[Literal["header", "footer", "body", "all"]] = None
    dtype_backend: Literal["pyarrow", "numpy_nullable"] = "numpy_nullable"
    storage_options: Optional[Dict[str, Any]] = None
@classmethod
def applicable_types(cls) -> Collection[Type]:
    """Loading HTML yields pandas DataFrames; nothing else is supported."""
    return [
        DATAFRAME_TYPE,
    ]
def _get_loading_kwargs(self) -> Dict[str, Any]:
kwargs = {}
if self.match is not None:
kwargs["match"] = self.match
if self.flavor is not None:
kwargs["flavor"] = self.flavor
if self.header is not None:
kwargs["header"] = self.header
if self.index_col is not None:
kwargs["index_col"] = self.index_col
if self.skiprows is not None:
kwargs["skiprows"] = self.skiprows
if self.attrs is not None:
kwargs["attrs"] = self.attrs
if self.parse_dates is not None:
kwargs["parse_dates"] = self.parse_dates
if self.thousands is not None:
kwargs["thousands"] = self.thousands
if self.encoding is not None:
kwargs["encoding"] = self.encoding