Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ CHANGES
tip (unreleased)
----------------

- Added an io module to facilate the creation of DataFrames and saving of
data form QuerySets
- Added an io module to ease the creation of DataFrames and saving of
data from QuerySets
- Syntax modified (see the docs)
28 changes: 14 additions & 14 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ read_frame
**Parameters**

- qs: The Django QuerySet.
- fields: The model field names to use in creating the frame.
- fieldnames: The model field names to use in creating the frame.
You can span a relationship in the usual Django way
by using double underscores to specify a related field
in another model
Expand Down Expand Up @@ -119,7 +119,7 @@ as shown in the example below ::
objects = DataFrameManager()


This will qive you access to the following QuerySet methods:
This will give you access to the following QuerySet methods:

- ``to_dataframe``
- ``to_timeseries``
Expand All @@ -132,7 +132,7 @@ Returns a DataFrame from the QuerySet

**Parameters**

- fields: The model fields to utilise in creating the frame.
- fieldnames: The model field names to utilise in creating the frame.
to span a relationship, just use the field name of related
fields across models, separated by double underscores,

Expand All @@ -142,9 +142,9 @@ Returns a DataFrame from the QuerySet

- fill_na: fill in missing observations using one of the following
this is a string specifying a pandas fill method
{'backfill, 'bill', 'pad', 'ffill'} or a scalar value
('backfill', 'bfill', 'pad', 'ffill') or a scalar value

- coerce_float: Attempt to convert the numeric non-string fields
- coerce_float: Attempt to convert the numeric non-string data
like object, decimal etc. to float if possible


Expand All @@ -155,14 +155,14 @@ Create a dataframe using all the fields in your model as follows ::

df = MyModel.to_dataframe()

This will include you primary key create a DataFrame only from secified
fields::
This will include your primary key. To create a DataFrame only from specified
field names::

df = MyData.to_dataframe('age', 'department', 'wage')
df = MyData.to_dataframe(['age', 'department', 'wage'])

To set ``full_name`` as the index ::

MyData.to_dataframe('age', 'department', 'wage', index='full_name')
MyData.to_dataframe(['age', 'department', 'wage'], index='full_name')

You can use filters and excludes ::

Expand All @@ -176,7 +176,7 @@ DataFrame index is instance of a DateTime or PeriodIndex

**Parameters**

- fields: The model fields to utilise in creating the frame.
- fieldnames: The model field names to utilise in creating the frame.
to span a relationship, just use the field name of related
fields across models, separated by double underscores,

Expand Down Expand Up @@ -278,18 +278,18 @@ Using a *wide* storage format ::
rs_kwargs = {'how': 'sum', 'kind': 'period'}
df = qs.to_timeseries(index='date_ix', pivot_columns='series_name',
values='value', storage='long',
freq='M', rs_kwargs=rs_kwargs)
freq='M', rs_kwargs=rs_kwargs)

to_pivot_table
--------------
A convenience method for creating a pivot table from a QuerySet

**Parameters**

- fields: The model fields to utilise in creating the frame.
- fieldnames: The model field names to utilise in creating the frame.
to span a relationship, just use the field name of related
fields across models, separated by double underscores,
- values : column to aggregate, optional
- values : column to aggregate, optional
- rows : list of column names or arrays to group on
Keys to group on the x-axis of the pivot table
- cols : list of column names or arrays to group on
Expand All @@ -302,7 +302,7 @@ A convenience method for creating a pivot table from a QuerySet
Value to replace missing values with
- margins : boolean, default False
Add all row / columns (e.g. for subtotal / grand totals)
- dropna : boolean, default True
- dropna : boolean, default True

**Example**
::
Expand Down
34 changes: 17 additions & 17 deletions django_pandas/io.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
from django.utils.encoding import force_text
import pandas as pd


def read_frame(qs, *fields, **kwargs):
def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False):
"""
Returns a dataframe form a QuerySet
Returns a dataframe from a QuerySet

Optionally specify the fields/columns to utilize and
specify a fields as the index
Optionally specify the field names/columns to utilize and
a field as the index

Parameters
----------

qs: The Django QuerySet.
fields: The model field names to use in creating the frame.
fieldnames: The model field names to use in creating the frame.
You can span a relationship in the usual Django way
by using double underscores to specify a related field
in another model
Expand All @@ -23,24 +24,23 @@ def read_frame(qs, *fields, **kwargs):
index_col: specify the field to use for the index. If the index
field is not in the field list it will be appended

coerce_float : boolean, default True
Attempt to convert values to non-string, non-numeric objects (like
coerce_float : boolean, default False
Attempt to convert values of non-string, non-numeric data (like
decimal.Decimal) to floating point, useful for SQL result sets
"""

index_col = kwargs.pop('index_col', None)
coerce_float = kwargs.pop('coerce_float', False)
if not fields:
fields = tuple([f.name for f in qs.model._meta.fields])
if fieldnames:
if index_col is not None and index_col not in fieldnames:
# Add it to the field names if not already there
fieldnames = tuple(fieldnames) + (index_col,)

if index_col is not None:
# add it to the fields if not already there
if index_col not in fields:
fields = fields + (index_col,)
else:
fields = qs.model._meta.fields
fieldnames = [f.name for f in fields]

recs = list(qs.values_list(*fields))
recs = list(qs.values_list(*fieldnames))

df = pd.DataFrame.from_records(recs, columns=fields,
df = pd.DataFrame.from_records(recs, columns=fieldnames,
coerce_float=coerce_float)
if index_col is not None:
df = df.set_index(index_col)
Expand Down
89 changes: 29 additions & 60 deletions django_pandas/managers.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
from django.db.models.query import QuerySet
import numpy as np
import pandas as pd
from model_utils.managers import PassThroughManager
from .io import read_frame


class DataFrameQuerySet(QuerySet):

def to_pivot_table(self, *fields, **kwargs):
def to_pivot_table(self, fieldnames=(), values=None, rows=None, cols=None,
aggfunc='mean', fill_value=None, margins=False,
dropna=True):
"""
A convenience method for creating a pivot table from a QuerySet

Parameters
----------
fields: The model fields to utilise in creating the frame.
fieldnames: The model field names to utilise in creating the frame.
to span a relationship, just use the field name of related
fields across models, separated by double underscores,
values : column to aggregate, optional
Expand All @@ -32,35 +33,26 @@ def to_pivot_table(self, *fields, **kwargs):
dropna : boolean, default True
Do not include columns whose entries are all NaN
"""
df = self.to_dataframe(*fields)
values = kwargs.pop('values')
rows = kwargs.pop('rows')
cols = kwargs.pop('cols')
aggfunc = kwargs.pop('aggfunc', np.mean)
fill_value = kwargs.pop('fill_value', None)
margins = kwargs.pop('margins', False)
dropna = kwargs.pop('dropna', False)

return pd.pivot_table(df, values=values,
fill_value=fill_value,
rows=rows, cols=cols,
aggfunc=aggfunc,
margins=margins,
df = self.to_dataframe(fieldnames)

return df.pivot_table(values=values, fill_value=fill_value, rows=rows,
cols=cols, aggfunc=aggfunc, margins=margins,
dropna=dropna)

def to_timeseries(self, *fields, **kwargs):
def to_timeseries(self, fieldnames=(), index=None, storage='wide', values=None,
pivot_columns=None, freq=None, rs_kwargs=None):
"""
A convenience method for creating a time series i.e the
DataFrame index is instance of a DateTime or PeriodIndex

Parameters
----------

fields: The model fields to utilise in creating the frame.
fieldnames: The model field names to utilise in creating the frame.
to span a relationship, just use the field name of related
fields across models, separated by double underscores,

index: specify the field to use for the index. If the index
index: specify the field to use for the index. If the index
field is not in the field list it will be appended. This
is mandatory.

Expand All @@ -83,29 +75,24 @@ def to_timeseries(self, *fields, **kwargs):

rs_kwargs: Arguments based on pandas.DataFrame.resample
"""
index = kwargs.pop('index', None)

if not index:
if index is None:
raise AssertionError('You must supply an index field')

storage = kwargs.get('storage', 'wide')

if storage not in ['wide', 'long']:
if storage not in ('wide', 'long'):
raise AssertionError('storage must be wide or long')
if rs_kwargs is None:
rs_kwargs = {}

if storage == 'wide':
df = self.to_dataframe(*fields, index=index)
df = self.to_dataframe(fieldnames, index=index)
else:
df = self.to_dataframe(*fields)
values = kwargs.get('values', None)
df = self.to_dataframe(fieldnames)
if values is None:
raise AssertionError('You must specify a values field')

pivot_columns = kwargs.get('pivot_columns', None)
if pivot_columns is None:
raise AssertionError('You must specify pivot_columns')

if isinstance(pivot_columns, list):
if isinstance(pivot_columns, (tuple, list)):
df['combined_keys'] = ''
for c in pivot_columns:
df['combined_keys'] += df[c].str.upper() + '.'
Expand All @@ -119,25 +106,21 @@ def to_timeseries(self, *fields, **kwargs):
df = df.pivot(index=index,
columns=pivot_columns,
values=values)
rule = kwargs.get('freq', None)

if rule:
rs_kwargs = kwargs.get('rs_kwargs', None)
if rs_kwargs:
df = df.resample(rule, **rs_kwargs)
else:
df = df.resample(rule)
if freq is not None:
df = df.resample(freq, **rs_kwargs)

return df

def to_dataframe(self, *fields, **kwargs):
def to_dataframe(self, fieldnames=(), index=None, fill_na=None,
coerce_float=False):
"""
Returns a DataFrame from the queryset

Parameters
-----------

fields: The model fields to utilise in creating the frame.
fieldnames: The model field names to utilise in creating the frame.
to span a relationship, just use the field name of related
fields across models, separated by double underscores,

Expand All @@ -149,29 +132,15 @@ def to_dataframe(self, *fields, **kwargs):
this is a string specifying a pandas fill method
{'backfill', 'bfill', 'pad', 'ffill'} or a scalar value

coerce_float: Attempt to convert the numeric non-string fields
coerce_float: Attempt to convert the numeric non-string data
like object, decimal etc. to float if possible
"""
index = kwargs.pop('index', None)
fill_na = kwargs.pop('fill_na', None)
coerce_float = kwargs.pop('coerce_float', False)
if not fields:
fields = tuple(self.model._meta.get_all_field_names())

if index is not None:
# add it to the fields if not already there
if index not in fields:
fields = fields + (index,)

qs = self.values_list(*fields)
recs = np.core.records.fromrecords(qs, names=qs.field_names)

df = pd.DataFrame.from_records(recs, coerce_float=coerce_float)
if index is not None:
df = df.set_index(index)
df = read_frame(self, fieldnames=fieldnames, index_col=index,
coerce_float=coerce_float)

if fill_na is not None:
if fill_na not in ['backfill', 'bfill', 'pad', 'ffill']:
if fill_na not in ('backfill', 'bfill', 'pad', 'ffill'):
df = df.fillna(value=fill_na)
else:
df = df.fillna(method=fill_na)
Expand Down
8 changes: 4 additions & 4 deletions django_pandas/tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,14 @@ def test_basic(self):
df = read_frame(qs)
n, c = df.shape
self.assertEqual(n, qs.count())
flds = MyModel._meta.get_all_field_names()
self.assertEqual(c, len(flds))
df1 = read_frame(qs, 'col1', 'col2')
fields = MyModel._meta.get_all_field_names()
self.assertEqual(c, len(fields))
df1 = read_frame(qs, ['col1', 'col2'])
self.assertEqual(df1.shape, (qs.count(), 2))

def test_index(self):
qs = MyModel.objects.all()
df = read_frame(qs, 'col1', 'col2', 'col3', 'col4',
df = read_frame(qs, ['col1', 'col2', 'col3', 'col4'],
index_col='index_col')
self.assertEqual(df.shape, (qs.count(), 4))
self.assertEqual(set(df.index.tolist()),
Expand Down
4 changes: 2 additions & 2 deletions django_pandas/tests/test_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def setUp(self):
col4=cols['col4']
)

def test_dataframae(self):
def test_dataframe(self):
qs = DataFrame.objects.all()
df = qs.to_dataframe()

Expand All @@ -36,7 +36,7 @@ def test_dataframae(self):
flds = DataFrame._meta.get_all_field_names()
self.assertEqual(c, len(flds))
qs2 = DataFrame.objects.filter(index__in=['a', 'b', 'c'])
df2 = qs2.to_dataframe('col1', 'col2', 'col3', index_field='index')
df2 = qs2.to_dataframe(['col1', 'col2', 'col3'], index='index')
n, c = df2.shape
self.assertEqual((n, c), (3, 3))

Expand Down