In [37]:
import pandas as pd
import numpy as np
from functools import reduce
import itertools

In [38]:
# Read the raw CSV export and parse its 'Date' column into datetimes as part
# of the same expression, so `df` is never visible with a string 'Date' column.
df = (
    pd.read_csv('data/TestData.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)
# Print the column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 13 columns):
Date             2000 non-null datetime64[ns]
Hour             2000 non-null int64
ExternalId       2000 non-null object
Domain           2000 non-null object
PlacementId      2000 non-null int64
Placements       2000 non-null int64
Impressions      2000 non-null int64
FillRate         2000 non-null float64
BuyCpm           2000 non-null float64
SellCpm          2000 non-null float64
ApbCpm           2000 non-null float64
Margin           2000 non-null float64
MarginPercent    2000 non-null float64
dtypes: datetime64[ns](1), float64(6), int64(4), object(2)
memory usage: 203.2+ KB


## Reduce a DataFrame by grouping on a specific column

In [39]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,Date,Hour,ExternalId,Domain,PlacementId,Placements,Impressions,FillRate,BuyCpm,SellCpm,ApbCpm,Margin,MarginPercent
0,2016-08-01 04:00:00,4,BC-US-8|13807760,myfox8.com,5410238,2,0,0.0,4.502,2.754,0.001,-0.009,0.0
1,2016-07-31 23:00:00,23,BC-US-8|13807760,hotnewhiphop.com,506724,0,1,0.0,0.0,2.871,0.001,0.0,0.0
2,2016-07-31 23:00:00,23,BC-US-8|13807760,drudgereport.com,6340489,1,0,0.0,1.32,3.366,0.001,-0.001,0.0
3,2016-07-31 16:00:00,16,BC-US-8|13807760,bloodyelbow.com,3640217,0,1,0.0,0.0,4.842,0.001,0.0,0.0
4,2016-07-31 02:00:00,2,BC-US-14|13859978,zillow.com,6397031,3,0,0.0,1.2,1.863,2.012,-0.004,0.0


In [40]:
class DataFrameReducer:
    """Collapse a DataFrame to one row per distinct value of one column.

    Every call to ``reduce_by_func`` groups the rows of ``df`` by the
    ``index_label`` column, applies an aggregation function to one other
    column, and appends the aggregates as a new column of ``result``.

    Attributes:
        df: The source DataFrame. ``drop_columns`` rebinds it to a new frame.
        index_label: Name of the column whose distinct values form the groups.
        result: ``None`` until the first reduction. Afterwards a 2-D ndarray
            of dtype ``object``: column 0 holds the sorted group keys and each
            further column holds the aggregates of one ``reduce_by_func`` call.
            Using dtype ``object`` keeps every cell's original type instead of
            coercing keys and numbers to a common (string) dtype.
    """

    def __init__(self, df, index_label):
        """Store the frame to reduce and the name of the grouping column."""
        self.df = df
        self.index_label = index_label
        self.__columns = [index_label]
        self.__attempt_flag = False
        self.result = None

    def drop_columns(self, labels, axis=1):
        """Drop ``labels`` from the working DataFrame along ``axis``."""
        self.df = self.df.drop(labels=labels, axis=axis)

    @staticmethod
    def _object_column(values):
        """Return ``values`` as an (n, 1) object ndarray, one value per row."""
        column = np.empty((len(values), 1), dtype=object)
        for row, value in enumerate(values):
            column[row, 0] = value
        return column

    @staticmethod
    def _same_key(left, right):
        """Compare two group keys, treating NaN/NaT as equal to themselves."""
        return left == right or (left != left and right != right)

    def reduce_by_func(self, label, func=sum):
        """Aggregate column ``label`` per group and append it to the result.

        Args:
            label: Name of the column to aggregate.
            func: Callable that receives the list of a group's values (in row
                order) and returns the aggregate, e.g. ``sum`` or ``np.mean``.

        Returns:
            A DataFrame with the group keys followed by every column
            aggregated so far (same as ``to_df()``).

        Raises:
            KeyError: If ``index_label`` or ``label`` is not a column of ``df``.
            ValueError: If the group keys differ from those of earlier calls
                (for example because rows were dropped in between).
        """
        # Extract both columns once instead of slicing a one-row frame per row.
        keys = self.df[self.index_label].tolist()
        values = self.df[label].tolist()

        # Stable sort on the key only: values are never compared with each
        # other and keep their original row order inside each group.
        pairs = sorted(zip(keys, values), key=lambda pair: pair[0])
        group_keys = []
        aggregates = []
        for key, members in itertools.groupby(pairs, key=lambda pair: pair[0]):
            group_keys.append(key)
            aggregates.append(func([value for _, value in members]))

        aggregate_column = self._object_column(aggregates)
        if self.__attempt_flag:
            known_keys = self.result[:, 0]
            aligned = len(known_keys) == len(group_keys) and all(
                self._same_key(old, new)
                for old, new in zip(known_keys, group_keys)
            )
            if not aligned:
                raise ValueError(
                    "group keys of %r changed since the previous reduction; "
                    "create a new DataFrameReducer" % (self.index_label,)
                )
            updated = np.append(self.result, aggregate_column, axis=1)
        else:
            updated = np.append(
                self._object_column(group_keys), aggregate_column, axis=1
            )

        # Commit state only after everything succeeded, so a failing call
        # cannot leave the column names and the result array out of step.
        self.result = updated
        self.__attempt_flag = True
        self.__columns.append(label)
        return self.to_df()

    def to_df(self):
        """Return the accumulated result as a DataFrame.

        Before the first reduction this is an empty frame with only the
        ``index_label`` column.
        """
        if self.result is None:
            return pd.DataFrame(None, columns=self.__columns)
        # Passing rows as Python lists lets pandas infer a dtype per column.
        return pd.DataFrame(self.result.tolist(), columns=self.__columns)

In [41]:
# Start a fresh reducer that groups the rows of `df` by their 'Date' value.
reduced = DataFrameReducer(df=df, index_label='Date')

In [42]:
# Column -> aggregation to apply per 'Date' group, in output column order:
# CPM columns are averaged, counts and margin are summed.
date_aggregations = [
    ('BuyCpm', np.mean),
    ('SellCpm', np.mean),
    ('ApbCpm', np.mean),
    ('Placements', sum),
    ('Impressions', sum),
    ('Margin', sum),
]

# Each call appends one column to the reducer, so re-running this cell
# without first re-creating `reduced` appends the same columns again.
for agg_label, agg_func in date_aggregations:
    reduced.reduce_by_func(agg_label, agg_func)

df1 = reduced.to_df()

In [46]:
# Report rows x columns of the per-date frame, then preview its first rows.
print('Dataframe size %sx%s' % (len(df1.index), len(df1.columns)))
df1.head(n=5)

Dataframe size 31x7


Unnamed: 0,Date,BuyCpm,SellCpm,ApbCpm,Placements,Impressions,Margin
0,2016-07-28T15:00:00.000000000,0.999672,4.61092,0.0186481,108092,22177,40.56
1,2016-07-28T16:00:00.000000000,0.688829,5.71346,0.129743,68,70,0.063
2,2016-07-28T17:00:00.000000000,0.531364,5.66468,0.001,19,20,0.01
3,2016-07-28T18:00:00.000000000,0.704125,5.1165,0.001,35,18,0.035
4,2016-07-28T19:00:00.000000000,0.527227,5.60864,0.001,27,15,-0.015


In [47]:
# Start a fresh reducer that groups the rows of `df` by their 'Domain' value.
reduced = DataFrameReducer(df=df, index_label='Domain')

In [48]:
# Column -> aggregation to apply per 'Domain' group, in output column order:
# CPM columns are averaged, counts and margin are summed.
domain_aggregations = [
    ('BuyCpm', np.mean),
    ('SellCpm', np.mean),
    ('ApbCpm', np.mean),
    ('Placements', sum),
    ('Impressions', sum),
    ('Margin', sum),
]

# Each call appends one column to the reducer, so re-running this cell
# without first re-creating `reduced` appends the same columns again.
for agg_label, agg_func in domain_aggregations:
    reduced.reduce_by_func(agg_label, agg_func)

df2 = reduced.to_df()

In [49]:
# Report rows x columns of the per-domain frame, then preview its first rows.
print('Dataframe size %sx%s' % (len(df2.index), len(df2.columns)))
df2.head(n=5)

Dataframe size 1402x7


Unnamed: 0,Domain,BuyCpm,SellCpm,ApbCpm,Placements,Impressions,Margin
0,ads.proboards.com,0.9205,6.075,0.001,23,1,-0.028
1,allrecipes.com,0.0,3.879,0.001,0,2,0.0
2,america.aljazeera.com,0.771333333333,6.048,0.001,2,1,-0.002
3,amyshealthybaking.com,1.262,5.607,0.001,1,0,-0.001
4,bet.com,0.0,6.705,0.001,0,22,0.0
