## [[Discourse] AttributeError: 'DataFrame' object has no attribute 'repartition' #207](https://github.com/coiled/dask-community/issues/207)

User is trying to do: read data --> filter --> convert some cols to tensors

### Original with code with toy data

In [1]:
import dask
# import torch

import pandas as pd
import dask.dataframe as dd

In [2]:
df = pd.DataFrame({'userID': ['1', '1', '2', '3'], 'questionID': ['a', 'b', 'b', 'c'], 'result': ['99', '100', '98', '97']})
df

Unnamed: 0,userID,questionID,result
0,1,a,99
1,1,b,100
2,2,b,98
3,3,c,97


In [3]:
ddf = dd.from_pandas(df, npartitions=2)

In [5]:
MAX_LEN = 300
users = df.userID.unique()
paths = []
labels = []

In [6]:
# @dask.delayed(nout=2)
def get_paths(df, user):
    temp = df.loc[df.userID == user]
    npart = round(len(temp)/MAX_LEN)
    if npart == 0:
        return (temp.questionID.values, temp.result.values)
    else:
        parted_df = temp.repartition(npartitions=npart)
        return (parted_df.partitions[0].questionID.values,
                parted_df.partitions[0].result.values)

In [7]:
# @dask.delayed
def convert_to_tensor(x):
    # return torch.tensor(x)
    return x

In [8]:
for user in users:
    path, label = get_paths(df, user)
    paths.append(convert_to_tensor(path))
    labels.append(convert_to_tensor(label))

In [None]:
dask.compute(*paths)

In [9]:
dask.compute(*labels)

In [9]:
paths

[array(['a', 'b'], dtype=object),
 array(['b'], dtype=object),
 array(['c'], dtype=object)]

In [10]:
labels

[array(['99', '100'], dtype=object),
 array(['98'], dtype=object),
 array(['97'], dtype=object)]

---

### User's updated code

In [None]:
trunc_df = pd.DataFrame()
for user in tqdm(df.userID.unique()):
    user_df = df.loc[df.userID == user].iloc[-MAX_LEN:]
    trunc_df = pd.concat([trunc_df, user_df])

In [None]:
@dask.delayed(nout=2)
def get_paths(df, user):
    temp = df.loc[df.userID == user]
    return (temp.questionID.values,
            temp.result.values)

---

### Our implementation with groupby

In [28]:
import numpy as np
import pandas as pd
import dask.dataframe as dd

df = pd.DataFrame({'userID': np.repeat(np.array([1,2,3]), 400),
                   'questionID': np.arange(0, 1200),
                   'result': np.arange(1200, 2400),
                  })

ddf = dd.from_pandas(df, npartitions=2)

ddf.groupby('userID').apply(lambda x: x[['questionID', 'result']].head(300)).compute()

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  ddf.groupby('userID').apply(lambda x: x[['questionID', 'result']].head(300)).compute()


Unnamed: 0_level_0,Unnamed: 1_level_0,questionID,result
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,0,1200
1,1,1,1201
1,2,2,1202
1,3,3,1203
1,4,4,1204
...,...,...,...
3,1095,1095,2295
3,1096,1096,2296
3,1097,1097,2297
3,1098,1098,2298
