## [[Discourse] AttributeError: 'DataFrame' object has no attribute 'repartition' #207](https://github.com/coiled/dask-community/issues/207)

User is trying to do: read data --> filter --> convert some cols to tensors

### Original with code with toy data

In [1]:
import dask
# import torch

import pandas as pd
import dask.dataframe as dd

In [2]:
df = pd.DataFrame({'userID': ['1', '1', '2', '3'], 'questionID': ['a', 'b', 'b', 'c'], 'result': ['99', '100', '98', '97']})
df

Unnamed: 0,userID,questionID,result
0,1,a,99
1,1,b,100
2,2,b,98
3,3,c,97


In [3]:
ddf = dd.from_pandas(df, npartitions=2)

In [5]:
MAX_LEN = 300
users = df.userID.unique()
paths = []
labels = []

In [6]:
# @dask.delayed(nout=2)
def get_paths(df, user):
    temp = df.loc[df.userID == user]
    npart = round(len(temp)/MAX_LEN)
    if npart == 0:
        return (temp.questionID.values, temp.result.values)
    else:
        parted_df = temp.repartition(npartitions=npart)
        return (parted_df.partitions[0].questionID.values,
                parted_df.partitions[0].result.values)

In [7]:
# @dask.delayed
def convert_to_tensor(x):
    # return torch.tensor(x)
    return x

In [8]:
for user in users:
    path, label = get_paths(df, user)
    paths.append(convert_to_tensor(path))
    labels.append(convert_to_tensor(label))

In [None]:
dask.compute(*paths)

In [9]:
dask.compute(*labels)

In [9]:
paths

[array(['a', 'b'], dtype=object),
 array(['b'], dtype=object),
 array(['c'], dtype=object)]

In [10]:
labels

[array(['99', '100'], dtype=object),
 array(['98'], dtype=object),
 array(['97'], dtype=object)]

---

### User's updated code

In [None]:
trunc_df = pd.DataFrame()
for user in tqdm(df.userID.unique()):
    user_df = df.loc[df.userID == user].iloc[-MAX_LEN:]
    trunc_df = pd.concat([trunc_df, user_df])

In [None]:
@dask.delayed(nout=2)
def get_paths(df, user):
    temp = df.loc[df.userID == user]
    return (temp.questionID.values,
            temp.result.values)

---

### My implementation with groupby

In [1]:
import pandas as pd
import dask.dataframe as dd

In [2]:
df = pd.DataFrame({'userID': ['1', '1', '2', '3'],
                   'questionID': ['a', 'b', 'b', 'c'],
                   'result': ['99', '100', '98', '97'],
                  })
df

Unnamed: 0,userID,questionID,result
0,1,a,99
1,1,b,100
2,2,b,98
3,3,c,97


In [3]:
ddf = dd.from_pandas(df, npartitions=1)
ddf

Unnamed: 0_level_0,userID,questionID,result
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,object,object,object
3,...,...,...


In [4]:
get_paths = dd.Aggregation(
    name = 'get_paths',
    chunk = lambda x: list(''.join(x)),
    agg = lambda x: x.sum(),
)

In [16]:
# pandas
df.groupby('userID').agg({'questionID':lambda x: list(''.join(x)), 'result':lambda x: list(''.join(x))})

In [5]:
# dask
ddf.groupby('userID').agg({'questionID':get_paths, 'result':get_paths})

ValueError: Metadata inference failed in `_groupby_apply_funcs`.

You have supplied a custom function and Dask is unable to 
determine the type of output that that function returns. 

To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.

Original error is below:
------------------------
TypeError('sequence item 0: expected str instance, tuple found')

Traceback:
---------
  File "/Users/pavithra/mambaforge/envs/dask-community-demos/lib/python3.9/site-packages/dask/dataframe/utils.py", line 176, in raise_on_meta_error
    yield
  File "/Users/pavithra/mambaforge/envs/dask-community-demos/lib/python3.9/site-packages/dask/dataframe/core.py", line 5833, in _emulate
    return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
  File "/Users/pavithra/mambaforge/envs/dask-community-demos/lib/python3.9/site-packages/dask/dataframe/groupby.py", line 911, in _groupby_apply_funcs
    r = func(grouped, **func_kwargs)
  File "/Users/pavithra/mambaforge/envs/dask-community-demos/lib/python3.9/site-packages/dask/dataframe/groupby.py", line 957, in _apply_func_to_column
    return func(df_like[column])
  File "/var/folders/n6/th0cntvx43x3mvr1f768tygm0000gn/T/ipykernel_57466/2371008608.py", line 3, in <lambda>
    chunk = lambda x: list(''.join(x)),
