In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from numpy.testing import assert_array_almost_equal
import statsmodels.api as sm
from line_profiler import LineProfiler


## Generating some data

In [None]:
def generate_grouped_data(seed=1, n_rows=2**11, n_cols=1, n_groups=15):
    """Generate a DataFrame of Gaussian noise with a cyclic ``group`` column.

    Parameters
    ----------
    seed : int
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        global legacy random state as a side effect.
    n_rows : int
        Number of rows to generate (default ``2**11``).
    n_cols : int
        Number of noise columns; they are labelled ``0 .. n_cols - 1``.
    n_groups : int
        Number of distinct group labels. Row ``i`` gets label ``i % n_groups``.

    Returns
    -------
    pandas.DataFrame
        ``n_rows`` rows with ``n_cols`` columns of normal noise (mean 0,
        standard deviation 4) plus an integer ``group`` column.

    Raises
    ------
    ValueError
        If ``n_groups`` is smaller than 1.
    """
    if n_groups < 1:
        raise ValueError("n_groups must be at least 1")
    # Keep the legacy global seeding so the default call reproduces the same
    # numbers as before the size parameters were introduced.
    np.random.seed(seed)
    random_data = np.random.normal(scale=4, size=(n_rows, n_cols))
    random_df = pd.DataFrame(random_data)
    # Cycle the labels 0, 1, ..., n_groups - 1 down the rows.
    random_df['group'] = np.arange(n_rows) % n_groups
    return random_df

# Build the benchmark frame (seed spelled out for reproducibility) and show
# how many rows fall into each group.
df = generate_grouped_data(seed=1)
df['group'].value_counts()


In [None]:
def ols_groupby_lstsq(series):
    """Return the least-squares slope of ``series`` against its positions.

    The values are regressed on ``0, 1, ..., n - 1`` plus a constant term
    using ``np.linalg.lstsq``; only the slope coefficient is returned.
    """
    n_obs = series.shape[0]
    # Design matrix: first column is the position, second the intercept term.
    design = np.vstack((np.arange(n_obs), np.ones(n_obs))).T
    coefficients = np.linalg.lstsq(design, series, rcond=-1)[0]
    return coefficients[0]

In [None]:
%%timeit
# Time the pandas groupby-transform version: ols_groupby_lstsq is applied to
# column 0 of each 'group' and the result is aligned back to the rows.
slopes_by_group = df.groupby('group')[0].transform(ols_groupby_lstsq)
slopes_by_group

In [None]:
import numba

@numba.jit(nopython=True)
def _ols_slope_numba_kernel(y):
    """Jitted kernel: least-squares slope of a 1-D float64 array vs. 0..n-1."""
    n_obs = y.shape[0]
    # Fill the design matrix explicitly as float64 so both lstsq operands
    # share a single dtype.
    design = np.empty((n_obs, 2), dtype=np.float64)
    for i in range(n_obs):
        design[i, 0] = i
        design[i, 1] = 1.0
    coefficients = np.linalg.lstsq(design, y, rcond=-1.0)[0]
    return coefficients[0]


def ols_groupby_lstsq_numba(series):
    """Return the least-squares slope of ``series`` against its positions.

    Numba-backed counterpart of the plain NumPy slope function. Nopython mode
    cannot type pandas objects, so the input (a pandas Series or anything
    array-like) is converted to a contiguous float64 ndarray here and the
    numeric work is delegated to the jitted kernel.

    Parameters
    ----------
    series : pandas.Series or array-like
        One-dimensional numeric values.

    Returns
    -------
    float
        Slope of the fitted line ``value ~ slope * position + intercept``.
    """
    y = np.ascontiguousarray(series, dtype=np.float64)
    return _ols_slope_numba_kernel(y)

In [None]:
# %%timeit
# slopes_by_group = df.groupby('group')[0].transform(ols_groupby_lstsq_numba)
# slopes_by_group

In [None]:
from collections import defaultdict, OrderedDict

# @numba.jit(nopython=True)
def get_group_ixs(ids):
    """Map each distinct id to a NumPy array of the positions where it occurs.

    Keys appear in order of first occurrence; positions are in ascending order.
    """
    positions_by_key = {}
    for position, key in enumerate(ids):
        positions_by_key.setdefault(key, []).append(position)
    return {
        key: np.array(positions)
        for key, positions in positions_by_key.items()
    }

def group_apply(values, group_ids, func):
    """Apply ``func`` to the values of each group and scatter the result back.

    ``values`` is indexed with the integer position arrays produced by
    ``get_group_ixs(group_ids)``; whatever ``func`` returns for a group is
    assigned to that group's positions in a NaN-initialised float array of
    length ``len(values)``, which is returned.
    """
    output = np.full(len(values), np.nan)
    for member_ixs in get_group_ixs(group_ids).values():
        output[member_ixs] = func(values[member_ixs])
    return output

In [None]:
# Peek at the first value of data column 0 as a NumPy array.
df[0].iloc[:1].values

In [None]:
# @numba.jit(nopython=True)
def ols_group_ixs(array):
    """Least-squares slope of ``array`` regressed on its positions 0..n-1.

    A constant column is included in the fit; only the slope is returned.
    """
    n_obs = array.shape[0]
    positions = np.arange(n_obs)
    intercept_column = np.ones(n_obs)
    regressors = np.vstack((positions, intercept_column)).T
    solution = np.linalg.lstsq(regressors, array, rcond=-1)
    coefficients = solution[0]
    return coefficients[0]


# Smoke-test group_apply on the first row only. The signature is
# group_apply(values, group_ids, func), so the data column goes first and the
# group labels second.
results_manual_group_apply = group_apply(
    df.iloc[:1][0].values,
    df.iloc[:1]['group'].values,
    ols_group_ixs,
)

In [None]:
%%timeit
results_sm = group_apply(df['group'].values, df[0].values, ols_group_ixs)

### Dask

- Pandas and NumPy distributed computing
- Bag (standard Python collections), Array (NumPy) and distributed DataFrame (Pandas)
- Super-easy parallelised Pandas functions

Dask official documentation: https://docs.dask.org/en/latest/dataframe.html

In [None]:
import dask.dataframe as dd



In [None]:
%%timeit
N_PARTITIONS = 16
SCHEDULER = "processes"
ddf = dd.from_pandas(df, npartitions=N_PARTITIONS, sort=False)
slopes = ddf.groupby("group")[0].transform(
    ols_groupby_lstsq,
    axis=1,
    meta=(None, 'float64'),
    raw=True,
).compute(scheduler=SCHEDULER)