In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from numpy.testing import assert_array_almost_equal
import statsmodels.api as sm
from line_profiler import LineProfiler


## Generating some data

In [2]:
def generate_data(seed=1, n_rows=64, n_cols=50):
    """Generate a DataFrame of random Gaussian data for a given seed.

    Parameters
    ----------
    seed : int, default 1
        Value passed to ``np.random.seed``. Note that this reseeds NumPy's
        global random state as a side effect.
    n_rows : int, default 64
        Number of rows in the generated frame.
    n_cols : int, default 50
        Number of columns in the generated frame.

    Returns
    -------
    pd.DataFrame
        An ``n_rows`` x ``n_cols`` frame of draws from a normal distribution
        with mean 0 and standard deviation 4.
    """
    np.random.seed(seed)
    # Drawing straight into the target shape fills the array in row-major
    # order, i.e. the same values as drawing a flat vector and reshaping it.
    random_data = np.random.normal(scale=4, size=(n_rows, n_cols))
    return pd.DataFrame(random_data)

# Build the synthetic frame used by the benchmarks below, report its
# (rows, columns) shape, and display it as the cell output.
df = generate_data()
print(df.shape)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,6.497381,-2.447026,-2.112687,-4.291874,3.461631,-9.206155,6.979247,-3.044828,1.276156,-0.997482,...,-0.767342,-3.550516,-2.988633,6.769818,0.203231,-2.547983,0.763662,8.401021,0.480636,2.468812
1,1.200681,-1.408999,-4.570073,-1.397371,-0.835577,2.346493,3.355934,3.724408,1.142349,3.540565,...,4.795672,0.740626,-1.501140,-2.554922,1.693977,0.309360,-1.375415,0.174387,-2.480003,2.792128
2,-1.788514,4.898031,1.613967,2.374314,-4.379647,0.677530,2.962226,-3.814802,-1.064874,0.130458,...,-0.746279,-0.406983,3.475545,3.001647,2.117861,0.550805,0.311285,2.473521,0.929978,2.730206
3,-1.240467,-9.739351,4.155298,8.747919,1.765458,-0.400621,-0.545779,-0.476217,0.069638,-4.488075,...,4.641354,1.477971,7.618635,4.444227,2.636199,-6.509753,2.409277,1.681129,3.243807,4.177768
4,-1.603513,3.296022,-2.249222,7.819512,-5.327807,-7.042754,-6.602885,-3.562222,-4.476462,7.824316,...,-1.992854,-1.243940,-0.007566,-5.586482,-3.445265,2.698846,2.474157,-1.772688,7.242140,-5.222908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,-0.646016,-7.401997,1.099890,2.913858,-2.120244,1.191432,-9.995577,-2.125582,-5.349606,3.701170,...,2.739278,4.709818,0.660340,0.932451,-7.210888,-3.087562,-2.490626,-0.602638,-5.600092,-5.204264
60,-0.308428,0.831301,3.944784,5.731026,2.113034,-1.470928,2.766883,-3.193387,0.855271,7.159904,...,-2.977891,6.334189,0.238889,-0.208896,-1.005461,-1.571929,0.755117,-1.284575,12.953373,-4.217500
61,0.468973,-1.881093,-2.810162,-2.671754,0.110620,3.724779,4.778026,2.786566,-1.694727,7.686953,...,5.023445,2.635732,-4.577423,1.938619,1.258080,6.443526,4.223474,-4.473279,-3.108701,3.977740
62,-5.286054,2.399608,-2.713392,2.035780,2.153059,1.498135,-2.987269,0.134151,4.880890,-0.738399,...,0.787383,-1.883373,-2.118701,-0.864703,6.362227,2.172455,5.508150,-5.056844,-1.948569,0.278514


In [28]:
def ols_lstsq(row):
    """Return the OLS slope of ``row`` regressed on its positional index.

    Fits ``row[i] ~ slope * i + intercept`` for ``i = 0 .. len(row) - 1`` with
    ``np.linalg.lstsq`` and returns only the slope.

    Parameters
    ----------
    row : pd.Series or 1-D array-like
        The observations to fit. A pandas Series (as passed by
        ``DataFrame.apply(..., axis=1)``), a NumPy array or a plain sequence
        of numbers are all accepted.

    Returns
    -------
    float
        The fitted slope.
    """
    # np.asarray accepts a Series as well as raw arrays and sequences.
    y = np.asarray(row)
    n_points = y.shape[0]
    # Design matrix of shape (n, 2): column 0 holds the positions 0..n-1,
    # column 1 is the constant term for the intercept.
    design = np.vstack((np.arange(n_points), np.ones(n_points))).T
    # rcond=-1 keeps NumPy's legacy machine-precision cutoff for singular values.
    slope, _intercept = np.linalg.lstsq(design, y, rcond=-1)[0]
    return slope


def ols_sklearn(row):
    """Return the slope of ``row`` against its positional index via scikit-learn.

    ``row`` is expected to be a pandas Series (its ``.values`` are used as the
    regression target); the single feature is the position ``0 .. len(row) - 1``.
    """
    n_points = row.shape[0]
    # scikit-learn expects a 2-D feature matrix, here one column of positions.
    positions = np.arange(n_points).reshape(-1, 1)
    fitted = LinearRegression().fit(positions, row.values)
    return fitted.coef_[0]


def ols_sm(row):
    """Return the slope of ``row`` against its positional index via statsmodels.

    ``row`` is expected to be a pandas Series (its ``.values`` are used as the
    response); the regressor is the position ``0 .. len(row) - 1`` plus a
    constant term.
    """
    n_points = row.shape[0]
    # add_constant puts the constant column first, so the fitted parameters
    # unpack as (intercept, slope).
    design = sm.add_constant(np.arange(n_points))
    fit_results = sm.OLS(row.values, design).fit()
    _intercept, slope = fit_results.params.squeeze()
    return slope

In [29]:
%%timeit
# Baseline timing: row-wise apply hands each row to ols_lstsq as a pandas Series.
results_lstsq = df.apply(ols_lstsq, axis=1)

7.35 s ± 34.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [30]:
%%timeit
# Same row-wise apply, timed with the scikit-learn based estimator.
results_sklearn = df.apply(ols_sklearn, axis=1)

KeyboardInterrupt: 

In [None]:
%%timeit
# Same row-wise apply, timed with the statsmodels based estimator.
results_sm = df.apply(ols_sm, axis=1)

In [None]:
# Compute each estimator's slopes once in a regular cell so they can be
# compared against each other below (the %%timeit cells only measure run time).
results_lstsq = df.apply(ols_lstsq, axis=1)
results_sklearn = df.apply(ols_sklearn, axis=1)
results_sm = df.apply(ols_sm, axis=1)

In [None]:
# Sanity check: the three implementations must produce (almost) the same
# slopes; scikit-learn's result is used as the reference for both comparisons.
assert_array_almost_equal(results_sklearn, results_lstsq)
assert_array_almost_equal(results_sklearn, results_sm)

In [None]:
# lp = LineProfiler()
# model = LinearRegression()
# lp.run("model.fit(X, row.values)")

In [None]:
%%timeit
# Explicit Python loop over row positions, timed for comparison against
# DataFrame.apply; the loop form is deliberate, it is what is being measured.
slopes = []
for row_idx in range(df.shape[0]):
    row = df.iloc[row_idx]
    slope = ols_lstsq(row)
    slopes.append(slope)
# Collect the per-row slopes into a Series, mirroring what apply returns.
results = pd.Series(slopes)


In [None]:
%%timeit
# Row-wise DataFrame.apply with ols_lstsq, timed as the alternative to an
# explicit Python loop over the rows.
slopes_apply = df.apply(ols_lstsq, axis=1)
# results = pd.Series(slopes_apply)

In [None]:
def ols_lstsq_raw(row):
    """Return the least-squares slope of a raw NumPy row against its index.

    Unlike a Series-based variant, ``row`` is passed to ``np.linalg.lstsq``
    as-is, so it should already be a NumPy array (for example the rows handed
    over by ``DataFrame.apply(..., raw=True)``).
    """
    n_points = row.shape[0]
    positions = np.arange(n_points)
    # Stack [positions, ones] and transpose to get the (n, 2) design matrix.
    design = np.vstack((positions, np.ones(n_points))).T
    coefficients = np.linalg.lstsq(design, row, rcond=-1)[0]
    # coefficients is (slope, intercept); only the slope is of interest.
    return coefficients[0]

In [None]:
%%timeit
# raw=True makes apply pass each row as a NumPy ndarray instead of building a
# pandas Series per row.
slopes_apply = df.apply(ols_lstsq_raw, axis=1, raw=True)

In [None]:
import numba

@numba.jit(nopython=True)
def ols_lstsq_raw_numba(row):
    """Numba-compiled (nopython mode) least-squares slope of a row.

    Fits ``row[i] ~ slope * i + intercept`` over the positions
    ``0 .. len(row) - 1`` with ``np.linalg.lstsq`` and returns the slope.
    ``row`` is passed to ``np.linalg.lstsq`` unchanged, so it is expected to
    be a NumPy array, e.g. the rows delivered by ``apply(..., raw=True)``.
    """
    lenght_x = row.shape[0]
    X = np.arange(lenght_x)
    ones = np.ones(lenght_x)
    # Stack the positions and a column of ones into an (n, 2) design matrix.
    X = np.vstack((X, ones)).T
    # The solution vector is (slope, intercept); only the slope is returned.
    slope, intercept = np.linalg.lstsq(X, row, rcond=-1)[0]
    return slope

# Warm-up call on a single row: Numba compiles the function on first use, so
# this keeps the compilation cost out of the timings that follow.
_ = df.iloc[:1].apply(ols_lstsq_raw_numba, axis=1, raw=True)

In [None]:
%%timeit
# Timed run: raw (ndarray) rows fed to the Numba-compiled slope function.
slopes_apply_raw_numba = df.apply(ols_lstsq_raw_numba, axis=1, raw=True)

### Dask

- Parallel and distributed computing for pandas and NumPy workloads
- Collections: Bag (standard Python collections), Array (NumPy) and distributed DataFrame (pandas)
- Makes it very easy to parallelise pandas functions

Dask official documentation: https://docs.dask.org/en/latest/dataframe.html

In [None]:
import dask.dataframe as dd

In [None]:
%%timeit
# Number of chunks the frame is split into, and the Dask scheduler that
# executes them ("processes" selects the multiprocessing scheduler).
N_PARTITIONS = 16
SCHEDULER = "processes"
# Wrap the pandas frame in a Dask DataFrame without sorting it by index first.
ddf = dd.from_pandas(df, npartitions=N_PARTITIONS, sort=False)
# meta declares the result up front as an unnamed float64 series; the work
# only runs when .compute() is called.
slopes_apply_raw_numba_and_dask = ddf.apply(
    ols_lstsq_raw_numba,
    axis=1,
    meta=(None, 'float64',),
    raw=True,
).compute(scheduler=SCHEDULER)

## Trying with more data

In [None]:
# Warm-up call on a single row so that Numba has already compiled
# ols_lstsq_raw_numba before it is timed on the larger data set.
_ = df.iloc[:1].apply(ols_lstsq_raw_numba, axis=1, raw=True)

In [None]:
# Real data: read the CSV and discard the non-numeric "Page" column.
df = pd.read_csv("wikipedia_train.csv").drop(columns="Page")
# Fill every missing value with a single number: the median of the
# per-column medians.
df = df.fillna(df.median().median())
df.shape

In [None]:
%%timeit
# Baseline on the current df: plain row-wise apply with the Series-based
# NumPy lstsq estimator.
results_lstsq = df.apply(ols_lstsq, axis=1)

In [None]:
%%timeit
# Same Dask + Numba pipeline as used on the synthetic frame, applied to the
# current df: 16 partitions, executed by the multiprocessing scheduler.
N_PARTITIONS = 16
SCHEDULER = "processes"
# Wrap the pandas frame in a Dask DataFrame without sorting it by index first.
ddf = dd.from_pandas(df, npartitions=N_PARTITIONS, sort=False)
# meta declares the result as an unnamed float64 series; .compute() triggers
# the actual evaluation.
slopes_apply_raw_numba_and_dask = ddf.apply(
    ols_lstsq_raw_numba,
    axis=1,
    meta=(None, 'float64',),
    raw=True,
).compute(scheduler=SCHEDULER)