Merge pull request #11 from BertrandBordage/master

Explicitly specify kwargs + update tests & docs.
chrisdev · Jan 28, 2014 · b68f140 · b68f140
2 parents 9eaedbe + 8e27ff9
commit b68f140
Show file tree

Hide file tree

Showing 6 changed files with 69 additions and 99 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -4,5 +4,6 @@ CHANGES
 tip (unreleased)
 ----------------
 
- - Added an io module to facilate the creation of DataFrames and saving of 
-   data form QuerySets
+ - Added an io module to ease the creation of DataFrames and saving of
+   data from QuerySets
+ - Syntax modified (see the docs)
diff --git a/README.rst b/README.rst
@@ -80,7 +80,7 @@ read_frame
 **Parameters**
 
     - qs: The Django QuerySet.
-    - fields: The model field names to use in creating the frame.
+    - fieldnames: The model field names to use in creating the frame.
               You can span a relationship in the usual Django way
               by using  double underscores to specify a related field
               in another model
@@ -119,7 +119,7 @@ as shown in the example below ::
         objects = DataFrameManager()
 
 
-This will qive you access to the following QuerySet methods:
+This will give you access to the following QuerySet methods:
 
     - ``to_datafame``
     - ``to_timeseries``
@@ -132,7 +132,7 @@ Returns a DataFrame from the QuerySet
 
 **Parameters**
 
-    - fields:  The model fields to utilise in creating the frame.
+    - fieldnames:  The model field names to utilise in creating the frame.
                 to span a relationship, just use the field name of related
                 fields across models, separated by double underscores,
 
@@ -142,9 +142,9 @@ Returns a DataFrame from the QuerySet
 
     - fill_na: fill in missing observations using one of the following
                     this is a string  specifying a pandas fill method
-                    {'backfill, 'bill', 'pad', 'ffill'} or a scalar value
+                    ('backfill', 'bfill', 'pad', 'ffill') or a scalar value
 
-    - coerce_float: Attempt to convert the numeric non-string fields
+    - coerce_float: Attempt to convert the numeric non-string data
                     like object, decimal etc. to float if possible
 
 
@@ -155,14 +155,14 @@ Create a dataframe using all the fields  in your model as follows ::
 
     df = MyModel.to_dataframe()
 
-This will include you primary key create a DataFrame only from secified 
-fields::
+This will include your primary key. To create a DataFrame only from specified
+field names::
     
-     df = MyData.to_dataframe('age', 'department', 'wage')
+     df = MyData.to_dataframe(['age', 'department', 'wage'])
 
 To set ``full_name`` as the index ::
 
-    MyData.to_dataframe('age', 'department', 'wage', index='full_name')
+    MyData.to_dataframe(['age', 'department', 'wage'], index='full_name')
 
 You can use filters and excludes ::
 
@@ -176,7 +176,7 @@ DataFrame index is instance of a DateTime or PeriodIndex
 
 **Parameters**
 
-    - fields:  The model fields to utilise in creating the frame.
+    - fieldnames:  The model field names to utilise in creating the frame.
         to span a relationship, just use the field name of related
         fields across models, separated by double underscores,
 
@@ -278,18 +278,18 @@ Using a *wide* storage format ::
     rs_kwargs = {'how': 'sum', 'kind': 'period'}
     df = qs.to_timeseries(index='date_ix', pivot_columns='series_name',
                           values='value', storage='long',
-                        freq='M', rs_kwargs=rs_kwargs)
+                          freq='M', rs_kwargs=rs_kwargs)
 
 to_pivot_table
 --------------
 A convenience method for creating a pivot table from a QuerySet
 
 **Parameters**
 
-   - fields:  The model fields to utilise in creating the frame.
+   - fieldnames:  The model field names to utilise in creating the frame.
         to span a relationship, just use the field name of related
         fields across models, separated by double underscores,
-   -  values : column to aggregate, optional
+   - values : column to aggregate, optional
    - rows : list of column names or arrays to group on
         Keys to group on the x-axis of the pivot table
    - cols : list of column names or arrays to group on
@@ -302,7 +302,7 @@ A convenience method for creating a pivot table from a QuerySet
         Value to replace missing values with
    - margins : boolean, default False
         Add all row / columns (e.g. for subtotal / grand totals)
-   -  dropna : boolean, default True
+   - dropna : boolean, default True
 
 **Example**
 ::

diff --git a/django_pandas/io.py b/django_pandas/io.py
@@ -1,18 +1,19 @@
+from django.utils.encoding import force_text
 import pandas as pd
 
 
-def read_frame(qs, *fields, **kwargs):
+def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False):
     """
-    Returns a dataframe form a QuerySet
+    Returns a dataframe from a QuerySet
 
-    Optionally specify the fields/columns to utilize and
-    specify a fields as the index
+    Optionally specify the field names/columns to utilize and
+    a field as the index
 
     Parameters
     ----------
 
     qs: The Django QuerySet.
-    fields: The model field names to use in creating the frame.
+    fieldnames: The model field names to use in creating the frame.
          You can span a relationship in the usual Django way
          by using  double underscores to specify a related field
          in another model
@@ -23,24 +24,23 @@ def read_frame(qs, *fields, **kwargs):
     index_col: specify the field to use  for the index. If the index
                field is not in the field list it will be appended
 
-    coerce_float : boolean, default True
-        Attempt to convert values to non-string, non-numeric objects (like
+    coerce_float : boolean, default False
+        Attempt to convert values to non-string, non-numeric data (like
         decimal.Decimal) to floating point, useful for SQL result sets
    """
 
-    index_col = kwargs.pop('index_col', None)
-    coerce_float = kwargs.pop('coerce_float', False)
-    if not fields:
-        fields = tuple([f.name for f in qs.model._meta.fields])
+    if fieldnames:
+        if index_col is not None and index_col not in fieldnames:
+            # Add it to the field names if not already there
+            fieldnames = tuple(fieldnames) + (index_col,)
 
-    if index_col is not None:
-        # add it to the fields if not already there
-        if index_col not in fields:
-            fields = fields + (index_col,)
+    else:
+        fields = qs.model._meta.fields
+        fieldnames = [f.name for f in fields]
 
-    recs = list(qs.values_list(*fields))
+    recs = list(qs.values_list(*fieldnames))
 
-    df = pd.DataFrame.from_records(recs, columns=fields,
+    df = pd.DataFrame.from_records(recs, columns=fieldnames,
                                    coerce_float=coerce_float)
     if index_col is not None:
         df = df.set_index(index_col)

diff --git a/django_pandas/managers.py b/django_pandas/managers.py
@@ -1,19 +1,20 @@
 from django.db.models.query import QuerySet
-import numpy as np
-import pandas as pd
 from model_utils.managers import PassThroughManager
+from .io import read_frame
 
 
 class DataFrameQuerySet(QuerySet):
 
-    def to_pivot_table(self, *fields, **kwargs):
+    def to_pivot_table(self, fieldnames=(), values=None, rows=None, cols=None,
+                       aggfunc='mean', fill_value=None, margins=False,
+                       dropna=True):
         """
         A convenience method for creating a time series i.e the
         DataFrame index is instance of a DateTime or PeriodIndex
 
         Parameters
         ----------
-        fields:  The model fields to utilise in creating the frame.
+        fieldnames:  The model field names to utilise in creating the frame.
             to span a relationship, just use the field name of related
             fields across models, separated by double underscores,
         values : column to aggregate, optional
@@ -32,35 +33,26 @@ def to_pivot_table(self, *fields, **kwargs):
         dropna : boolean, default True
         Do not include columns whose entries are all NaN
         """
-        df = self.to_dataframe(*fields)
-        values = kwargs.pop('values')
-        rows = kwargs.pop('rows')
-        cols = kwargs.pop('cols')
-        aggfunc = kwargs.pop('aggfunc', np.mean)
-        fill_value = kwargs.pop('fill_value', None)
-        margins = kwargs.pop('margins', False)
-        dropna = kwargs.pop('dropna', False)
-
-        return pd.pivot_table(df, values=values,
-                              fill_value=fill_value,
-                              rows=rows, cols=cols,
-                              aggfunc=aggfunc,
-                              margins=margins,
+        df = self.to_dataframe(fieldnames)
+
+        return df.pivot_table(values=values, fill_value=fill_value, rows=rows,
+                              cols=cols, aggfunc=aggfunc, margins=margins,
                               dropna=dropna)
 
-    def to_timeseries(self, *fields, **kwargs):
+    def to_timeseries(self, fieldnames=(), index=None, storage='wide', values=None,
+                      pivot_columns=None, freq=None, rs_kwargs=None):
         """
         A convenience method for creating a time series i.e the
         DataFrame index is instance of a DateTime or PeriodIndex
 
         Parameters
         ----------
 
-        fields:  The model fields to utilise in creating the frame.
+        fieldnames:  The model field names to utilise in creating the frame.
             to span a relationship, just use the field name of related
             fields across models, separated by double underscores,
 
-       index: specify the field to use  for the index. If the index
+        index: specify the field to use  for the index. If the index
             field is not in the field list it will be appended. This
             is mandatory.
 
@@ -83,29 +75,24 @@ def to_timeseries(self, *fields, **kwargs):
 
         rs_kwargs: Arguments based on pandas.DataFrame.resample
         """
-        index = kwargs.pop('index', None)
-
-        if not index:
+        if index is None:
             raise AssertionError('You must supply an index field')
-
-        storage = kwargs.get('storage', 'wide')
-
-        if storage not in ['wide', 'long']:
+        if storage not in ('wide', 'long'):
             raise AssertionError('storage must be wide or long')
+        if rs_kwargs is None:
+            rs_kwargs = {}
 
         if storage == 'wide':
-            df = self.to_dataframe(*fields, index=index)
+            df = self.to_dataframe(fieldnames, index=index)
         else:
-            df = self.to_dataframe(*fields)
-            values = kwargs.get('values', None)
+            df = self.to_dataframe(fieldnames)
             if values is None:
                 raise AssertionError('You must specify a values field')
 
-            pivot_columns = kwargs.get('pivot_columns', None)
             if pivot_columns is None:
                 raise AssertionError('You must specify pivot_columns')
 
-            if isinstance(pivot_columns, list):
+            if isinstance(pivot_columns, (tuple, list)):
                 df['combined_keys'] = ''
                 for c in pivot_columns:
                     df['combined_keys'] += df[c].str.upper() + '.'
@@ -119,25 +106,21 @@ def to_timeseries(self, *fields, **kwargs):
                 df = df.pivot(index=index,
                               columns=pivot_columns,
                               values=values)
-        rule = kwargs.get('freq', None)
 
-        if rule:
-            rs_kwargs = kwargs.get('rs_kwargs', None)
-            if rs_kwargs:
-                df = df.resample(rule, **rs_kwargs)
-            else:
-                df = df.resample(rule)
+        if freq is not None:
+            df = df.resample(freq, **rs_kwargs)
 
         return df
 
-    def to_dataframe(self, *fields, **kwargs):
+    def to_dataframe(self, fieldnames=(), index=None, fill_na=None,
+                     coerce_float=False):
         """
         Returns a DataFrame from the queryset
 
         Paramaters
         -----------
 
-        fields:  The model fields to utilise in creating the frame.
+        fieldnames:  The model fields to utilise in creating the frame.
             to span a relationship, just use the field name of related
             fields across models, separated by double underscores,
 
@@ -149,29 +132,15 @@ def to_dataframe(self, *fields, **kwargs):
                  this is a string  specifying a pandas fill method
                  {'backfill, 'bill', 'pad', 'ffill'} or a scalar value
 
-        coerce_float: Attempt to convert the numeric non-string fields
+        coerce_float: Attempt to convert the numeric non-string data
                 like object, decimal etc. to float if possible
         """
-        index = kwargs.pop('index', None)
-        fill_na = kwargs.pop('fill_na', None)
-        coerce_float = kwargs.pop('coerce_float', False)
-        if not fields:
-            fields = tuple(self.model._meta.get_all_field_names())
-
-        if index is not None:
-            # add it to the fields if not already there
-            if index not in fields:
-                fields = fields + (index,)
-
-        qs = self.values_list(*fields)
-        recs = np.core.records.fromrecords(qs, names=qs.field_names)
 
-        df = pd.DataFrame.from_records(recs, coerce_float=coerce_float)
-        if index is not None:
-            df = df.set_index(index)
+        df = read_frame(self, fieldnames=fieldnames, index_col=index,
+                        coerce_float=coerce_float)
 
         if fill_na is not None:
-            if fill_na not in ['backfill', 'bfill', 'pad', 'ffill']:
+            if fill_na not in ('backfill', 'bfill', 'pad', 'ffill'):
                 df = df.fillna(value=fill_na)
             else:
                 df = df.fillna(method=fill_na)

diff --git a/django_pandas/tests/test_io.py b/django_pandas/tests/test_io.py
@@ -32,14 +32,14 @@ def test_basic(self):
         df = read_frame(qs)
         n, c = df.shape
         self.assertEqual(n, qs.count())
-        flds = MyModel._meta.get_all_field_names()
-        self.assertEqual(c, len(flds))
-        df1 = read_frame(qs, 'col1', 'col2')
+        fields = MyModel._meta.get_all_field_names()
+        self.assertEqual(c, len(fields))
+        df1 = read_frame(qs, ['col1', 'col2'])
         self.assertEqual(df1.shape, (qs.count(), 2))
 
     def test_index(self):
         qs = MyModel.objects.all()
-        df = read_frame(qs, 'col1', 'col2', 'col3', 'col4',
+        df = read_frame(qs, ['col1', 'col2', 'col3', 'col4'],
                         index_col='index_col')
         self.assertEqual(df.shape, (qs.count(), 4))
         self.assertEqual(set(df.index.tolist()),

diff --git a/django_pandas/tests/test_manager.py b/django_pandas/tests/test_manager.py
@@ -27,7 +27,7 @@ def setUp(self):
                 col4=cols['col4']
             )
 
-    def test_dataframae(self):
+    def test_dataframe(self):
         qs = DataFrame.objects.all()
         df = qs.to_dataframe()
 
@@ -36,7 +36,7 @@ def test_dataframae(self):
         flds = DataFrame._meta.get_all_field_names()
         self.assertEqual(c, len(flds))
         qs2 = DataFrame.objects.filter(index__in=['a', 'b', 'c'])
-        df2 = qs2.to_dataframe('col1', 'col2', 'col3', index_field='index')
+        df2 = qs2.to_dataframe(['col1', 'col2', 'col3'], index='index')
         n, c = df2.shape
         self.assertEqual((n, c), (3, 3))