diff --git a/CHANGES.rst b/CHANGES.rst index 2c160e8..0a9565b 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,5 +4,6 @@ CHANGES tip (unreleased) ---------------- - - Added an io module to facilate the creation of DataFrames and saving of - data form QuerySets + - Added an io module to ease the creation of DataFrames and saving of + data from QuerySets + - Field names are now passed as a single list via ``fieldnames`` instead of as positional arguments (see the docs) diff --git a/README.rst b/README.rst index 7b24c32..b246fc5 100644 --- a/README.rst +++ b/README.rst @@ -80,7 +80,7 @@ read_frame **Parameters** - qs: The Django QuerySet. - - fields: The model field names to use in creating the frame. + - fieldnames: The model field names to use in creating the frame. You can span a relationship in the usual Django way by using double underscores to specify a related field in another model @@ -119,7 +119,7 @@ as shown in the example below :: objects = DataFrameManager() -This will qive you access to the following QuerySet methods: +This will give you access to the following QuerySet methods: - ``to_datafame`` - ``to_timeseries`` @@ -132,7 +132,7 @@ Returns a DataFrame from the QuerySet **Parameters** - - fields: The model fields to utilise in creating the frame. + - fieldnames: The model field names to utilise in creating the frame. to span a relationship, just use the field name of related fields across models, separated by double underscores, @@ -142,9 +142,9 @@ Returns a DataFrame from the QuerySet - fill_na: fill in missing observations using one of the following this is a string specifying a pandas fill method - {'backfill, 'bill', 'pad', 'ffill'} or a scalar value + ('backfill', 'bfill', 'pad', 'ffill') or a scalar value - - coerce_float: Attempt to convert the numeric non-string fields + - coerce_float: Attempt to convert the numeric non-string data like object, decimal etc. 
to float if possible @@ -155,14 +155,14 @@ Create a dataframe using all the fields in your model as follows :: df = MyModel.to_dataframe() -This will include you primary key create a DataFrame only from secified -fields:: +This will include your primary key. To create a DataFrame only from specified +field names:: - df = MyData.to_dataframe('age', 'department', 'wage') + df = MyData.to_dataframe(['age', 'department', 'wage']) To set ``full_name`` as the index :: - MyData.to_dataframe('age', 'department', 'wage', index='full_name') + MyData.to_dataframe(['age', 'department', 'wage'], index='full_name') You can use filters and excludes :: @@ -176,7 +176,7 @@ DataFrame index is instance of a DateTime or PeriodIndex **Parameters** - - fields: The model fields to utilise in creating the frame. + - fieldnames: The model field names to utilise in creating the frame. to span a relationship, just use the field name of related fields across models, separated by double underscores, @@ -278,7 +278,7 @@ Using a *wide* storage format :: rs_kwargs = {'how': 'sum', 'kind': 'period'} df = qs.to_timeseries(index='date_ix', pivot_columns='series_name', values='value', storage='long', - freq='M', rs_kwargs=rs_kwargs) + freq='M', rs_kwargs=rs_kwargs) to_pivot_table -------------- @@ -286,10 +286,10 @@ A convenience method for creating a pivot table from a QuerySet **Parameters** - - fields: The model fields to utilise in creating the frame. + - fieldnames: The model field names to utilise in creating the frame. 
to span a relationship, just use the field name of related fields across models, separated by double underscores, - - values : column to aggregate, optional + - values : column to aggregate, optional - rows : list of column names or arrays to group on Keys to group on the x-axis of the pivot table - cols : list of column names or arrays to group on @@ -302,7 +302,7 @@ A convenience method for creating a pivot table from a QuerySet Value to replace missing values with - margins : boolean, default False Add all row / columns (e.g. for subtotal / grand totals) - - dropna : boolean, default True + - dropna : boolean, default True **Example** :: diff --git a/django_pandas/io.py b/django_pandas/io.py index 17a82e9..84deab5 100644 --- a/django_pandas/io.py +++ b/django_pandas/io.py @@ -1,18 +1,19 @@ +from django.utils.encoding import force_text import pandas as pd -def read_frame(qs, *fields, **kwargs): +def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False): """ - Returns a dataframe form a QuerySet + Returns a dataframe from a QuerySet - Optionally specify the fields/columns to utilize and - specify a fields as the index + Optionally specify the field names/columns to utilize and + a field as the index Parameters ---------- qs: The Django QuerySet. - fields: The model field names to use in creating the frame. + fieldnames: The model field names to use in creating the frame. You can span a relationship in the usual Django way by using double underscores to specify a related field in another model @@ -23,24 +24,23 @@ def read_frame(qs, *fields, **kwargs): index_col: specify the field to use for the index. 
If the index field is not in the field list it will be appended - coerce_float : boolean, default True - Attempt to convert values to non-string, non-numeric objects (like + coerce_float : boolean, default False + Attempt to convert values to non-string, non-numeric data (like decimal.Decimal) to floating point, useful for SQL result sets """ - index_col = kwargs.pop('index_col', None) - coerce_float = kwargs.pop('coerce_float', False) - if not fields: - fields = tuple([f.name for f in qs.model._meta.fields]) + if fieldnames: + if index_col is not None and index_col not in fieldnames: + # Add it to the field names if not already there + fieldnames = tuple(fieldnames) + (index_col,) - if index_col is not None: - # add it to the fields if not already there - if index_col not in fields: - fields = fields + (index_col,) + else: + fields = qs.model._meta.fields + fieldnames = [f.name for f in fields] - recs = list(qs.values_list(*fields)) + recs = list(qs.values_list(*fieldnames)) - df = pd.DataFrame.from_records(recs, columns=fields, + df = pd.DataFrame.from_records(recs, columns=fieldnames, coerce_float=coerce_float) if index_col is not None: df = df.set_index(index_col) diff --git a/django_pandas/managers.py b/django_pandas/managers.py index 4894a79..2f38c8c 100644 --- a/django_pandas/managers.py +++ b/django_pandas/managers.py @@ -1,19 +1,20 @@ from django.db.models.query import QuerySet -import numpy as np -import pandas as pd from model_utils.managers import PassThroughManager +from .io import read_frame class DataFrameQuerySet(QuerySet): - def to_pivot_table(self, *fields, **kwargs): + def to_pivot_table(self, fieldnames=(), values=None, rows=None, cols=None, + aggfunc='mean', fill_value=None, margins=False, + dropna=True): """ A convenience method for creating a time series i.e the DataFrame index is instance of a DateTime or PeriodIndex Parameters ---------- - fields: The model fields to utilise in creating the frame. 
+ fieldnames: The model field names to utilise in creating the frame. to span a relationship, just use the field name of related fields across models, separated by double underscores, values : column to aggregate, optional @@ -32,23 +33,14 @@ def to_pivot_table(self, *fields, **kwargs): dropna : boolean, default True Do not include columns whose entries are all NaN """ - df = self.to_dataframe(*fields) - values = kwargs.pop('values') - rows = kwargs.pop('rows') - cols = kwargs.pop('cols') - aggfunc = kwargs.pop('aggfunc', np.mean) - fill_value = kwargs.pop('fill_value', None) - margins = kwargs.pop('margins', False) - dropna = kwargs.pop('dropna', False) - - return pd.pivot_table(df, values=values, - fill_value=fill_value, - rows=rows, cols=cols, - aggfunc=aggfunc, - margins=margins, + df = self.to_dataframe(fieldnames) + + return df.pivot_table(values=values, fill_value=fill_value, rows=rows, + cols=cols, aggfunc=aggfunc, margins=margins, dropna=dropna) - def to_timeseries(self, *fields, **kwargs): + def to_timeseries(self, fieldnames=(), index=None, storage='wide', values=None, + pivot_columns=None, freq=None, rs_kwargs=None): """ A convenience method for creating a time series i.e the DataFrame index is instance of a DateTime or PeriodIndex @@ -56,11 +48,11 @@ def to_timeseries(self, *fields, **kwargs): Parameters ---------- - fields: The model fields to utilise in creating the frame. + fieldnames: The model field names to utilise in creating the frame. to span a relationship, just use the field name of related fields across models, separated by double underscores, - index: specify the field to use for the index. If the index + index: specify the field to use for the index. If the index field is not in the field list it will be appended. This is mandatory. 
@@ -83,29 +75,24 @@ def to_timeseries(self, *fields, **kwargs): rs_kwargs: Arguments based on pandas.DataFrame.resample """ - index = kwargs.pop('index', None) - - if not index: + if index is None: raise AssertionError('You must supply an index field') - - storage = kwargs.get('storage', 'wide') - - if storage not in ['wide', 'long']: + if storage not in ('wide', 'long'): raise AssertionError('storage must be wide or long') + if rs_kwargs is None: + rs_kwargs = {} if storage == 'wide': - df = self.to_dataframe(*fields, index=index) + df = self.to_dataframe(fieldnames, index=index) else: - df = self.to_dataframe(*fields) - values = kwargs.get('values', None) + df = self.to_dataframe(fieldnames) if values is None: raise AssertionError('You must specify a values field') - pivot_columns = kwargs.get('pivot_columns', None) if pivot_columns is None: raise AssertionError('You must specify pivot_columns') - if isinstance(pivot_columns, list): + if isinstance(pivot_columns, (tuple, list)): df['combined_keys'] = '' for c in pivot_columns: df['combined_keys'] += df[c].str.upper() + '.' @@ -119,25 +106,21 @@ def to_timeseries(self, *fields, **kwargs): df = df.pivot(index=index, columns=pivot_columns, values=values) - rule = kwargs.get('freq', None) - if rule: - rs_kwargs = kwargs.get('rs_kwargs', None) - if rs_kwargs: - df = df.resample(rule, **rs_kwargs) - else: - df = df.resample(rule) + if freq is not None: + df = df.resample(freq, **rs_kwargs) return df - def to_dataframe(self, *fields, **kwargs): + def to_dataframe(self, fieldnames=(), index=None, fill_na=None, + coerce_float=False): """ Returns a DataFrame from the queryset Paramaters ----------- - fields: The model fields to utilise in creating the frame. + fieldnames: The model fields to utilise in creating the frame. 
to span a relationship, just use the field name of related fields across models, separated by double underscores, @@ -149,29 +132,15 @@ def to_dataframe(self, *fields, **kwargs): this is a string specifying a pandas fill method {'backfill, 'bill', 'pad', 'ffill'} or a scalar value - coerce_float: Attempt to convert the numeric non-string fields + coerce_float: Attempt to convert the numeric non-string data like object, decimal etc. to float if possible """ - index = kwargs.pop('index', None) - fill_na = kwargs.pop('fill_na', None) - coerce_float = kwargs.pop('coerce_float', False) - if not fields: - fields = tuple(self.model._meta.get_all_field_names()) - - if index is not None: - # add it to the fields if not already there - if index not in fields: - fields = fields + (index,) - - qs = self.values_list(*fields) - recs = np.core.records.fromrecords(qs, names=qs.field_names) - df = pd.DataFrame.from_records(recs, coerce_float=coerce_float) - if index is not None: - df = df.set_index(index) + df = read_frame(self, fieldnames=fieldnames, index_col=index, + coerce_float=coerce_float) if fill_na is not None: - if fill_na not in ['backfill', 'bfill', 'pad', 'ffill']: + if fill_na not in ('backfill', 'bfill', 'pad', 'ffill'): df = df.fillna(value=fill_na) else: df = df.fillna(method=fill_na) diff --git a/django_pandas/tests/test_io.py b/django_pandas/tests/test_io.py index 5d8d987..1866f56 100644 --- a/django_pandas/tests/test_io.py +++ b/django_pandas/tests/test_io.py @@ -32,14 +32,14 @@ def test_basic(self): df = read_frame(qs) n, c = df.shape self.assertEqual(n, qs.count()) - flds = MyModel._meta.get_all_field_names() - self.assertEqual(c, len(flds)) - df1 = read_frame(qs, 'col1', 'col2') + fields = MyModel._meta.get_all_field_names() + self.assertEqual(c, len(fields)) + df1 = read_frame(qs, ['col1', 'col2']) self.assertEqual(df1.shape, (qs.count(), 2)) def test_index(self): qs = MyModel.objects.all() - df = read_frame(qs, 'col1', 'col2', 'col3', 'col4', + df = 
read_frame(qs, ['col1', 'col2', 'col3', 'col4'], index_col='index_col') self.assertEqual(df.shape, (qs.count(), 4)) self.assertEqual(set(df.index.tolist()), diff --git a/django_pandas/tests/test_manager.py b/django_pandas/tests/test_manager.py index 66c1774..68d0b8b 100644 --- a/django_pandas/tests/test_manager.py +++ b/django_pandas/tests/test_manager.py @@ -27,7 +27,7 @@ def setUp(self): col4=cols['col4'] ) - def test_dataframae(self): + def test_dataframe(self): qs = DataFrame.objects.all() df = qs.to_dataframe() @@ -36,7 +36,7 @@ def test_dataframae(self): flds = DataFrame._meta.get_all_field_names() self.assertEqual(c, len(flds)) qs2 = DataFrame.objects.filter(index__in=['a', 'b', 'c']) - df2 = qs2.to_dataframe('col1', 'col2', 'col3', index_field='index') + df2 = qs2.to_dataframe(['col1', 'col2', 'col3'], index='index') n, c = df2.shape self.assertEqual((n, c), (3, 3))