In [4]:
from __future__ import annotations
import seaborn
import pandas as pd
import typing

iris_df = seaborn.load_dataset("iris")
print(iris_df.shape)
print(iris_df.head())

(150, 5)
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


In [11]:
import dataclasses

@dataclasses.dataclass
class IrisEntry:
    sepal_length: float
    sepal_width: float
    species: str
    
    def sepal_area(self) -> float:
        return self.sepal_length * self.sepal_width
        
    @classmethod
    def from_series(cls, row: pd.Series):
        return cls(
            sepal_length = row['sepal_length'],
            sepal_width = row['sepal_width'],
            species = row['species'],
        )
    
class AverageIrisEntry(IrisEntry):
    pass
    
import multiprocessing
import statistics
import numpy as np
import typing
class Irises(typing.List[IrisEntry]):

    @classmethod
    def from_dataframe(cls, df: pd.DataFrame):
        irises = cls()
        for ind, row in iris_df.iterrows():
            irises.append(IrisEntry.from_series(row))
        return irises
    
    def average_by_species(self, do_width: bool = False) -> IrisEntry:
        avs = dict()
        for species, irises in self.group_by_species().items():
            avs[species] = AverageIrisEntry(
                sepal_length = statistics.mean([ie.sepal_length for ie in irises]),
                sepal_width = statistics.mean([ie.sepal_width for ie in irises]) if do_width else None,
                species = species,
            )
        return avs
    
    def group_by_species(self) -> typing.Dict[Species, Irises]:
        groups = dict()
        for ir in self:
            groups.setdefault(ir.species, self.__class__())
            groups[ir.species].append(ir)
        return groups
    
    def sepal_arrays(self) -> typing.Tuple[np.ndarray[np.float64], np.ndarray[np.float64]]:
        return np.array([ie.sepal_length for ie in self], dtype=np.float64), np.array([ie.sepal_width for ie in self], dtype=np.float64)
        
    def calc_areas_parallel(self, n_processes: int = 4) -> typing.List[IrisArea]:
        with multiprocessing.Pool(n_processes) as p:
            areas = p.map(self.calc_iris_area, self)
        return areas
irises = Irises.from_dataframe(iris_df)
len(irises)

150

In [9]:
long_iris_df = pd.concat([iris_df for _ in range(10000)])
irises_long = Irises.from_dataframe(long_iris_df)
print(long_iris_df.shape, len(irises_long))

%timeit irises_long.average_by_species()
%timeit iris_df.groupby('species')['sepal_length'].sum()

%timeit irises_long.average_by_species(True)
%timeit iris_df.groupby('species').agg({'sepal_length':"sum", 'sepal_width': "sum"})

(1500000, 5) 150
209 µs ± 1.36 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
478 µs ± 3.06 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
393 µs ± 15.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.27 ms ± 7.43 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [13]:
%timeit l, w = irises_long.sepal_arrays()
%timeit l * w
%timeit long_iris_df['sepal_length'] * long_iris_df['sepal_length']

29.1 µs ± 16.5 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
788 ns ± 3.51 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
1.65 ms ± 75.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [9]:
import numpy as np
import dataclasses

@dataclasses.dataclass
class IrisArray:
    sepal_length: np.ndarray[np.float64]
    sepal_width: np.ndarray[np.float64]
    species: np.ndarray[str]
    
    @classmethod
    def from_df(cls, df: pd.DataFrame):
        return cls(
            sepal_length = df['sepal_length'],
            sepal_width = df['sepal_width'],
            species = df['species'],
        )
    
    def filter_sepal_length(self, min_sepal_length: float) -> IrisArray:
        ind = self.sepal_length >= min_sepal_length
        return self.__class__(
            sepal_length = self.sepal_length[ind],
            sepal_width = self.sepal_width[ind],
            species = self.species[ind],
        )
    
irises = IrisArray.from_df(iris_df).filter_sepal_length(5.0)
irises

IrisArray(sepal_length=0      5.1
4      5.0
5      5.4
7      5.0
10     5.4
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: sepal_length, Length: 128, dtype: float64, sepal_width=0      3.5
4      3.6
5      3.9
7      3.4
10     3.7
      ... 
145    3.0
146    2.5
147    3.0
148    3.4
149    3.0
Name: sepal_width, Length: 128, dtype: float64, species=0         setosa
4         setosa
5         setosa
7         setosa
10        setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 128, dtype: object)