In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pyodbc
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"

import helper
import dataloader

In [3]:
query = """
        SELECT
            MarketName, 
            Booking, 
            DateOfBirth, 
            DateBooked, 
            WeekBooked, 
            isnull(ws.SortOrder,13) as SortOrder,
            ProgramDuration,
            DateCaxed, 
            FiscalYear, 
            ProgramStartDate, 
            SalesRepID, 
            u.Name as SalesRepName, 
            OriginalRanking, 
            MethodOfCreation,
            hasBooked =1,
            hasCaxed = case when DateCaxed is null then 0 else 1 end
        from sales s
        join Users u on u.User_id = s.SalesRepID
        left join WeekSort ws on ws.WeekNum = right(s.WeekBooked,2)
        """

In [4]:
df_original = helper.get_dataframe_from_sqlserver_query(query=query)


pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.



In [5]:
df_original.head()

Unnamed: 0,MarketName,Booking,DateOfBirth,DateBooked,WeekBooked,SortOrder,ProgramDuration,DateCaxed,FiscalYear,ProgramStartDate,SalesRepID,SalesRepName,OriginalRanking,MethodOfCreation,hasBooked,hasCaxed
0,China,CN_198974,2004-05-06,2020-06-24 21:23:37,2026,39,9.0,NaT,2020,2020-06-29,892,Malvina Belgrano,6,Website,1,0
1,Japan,JP_180044,2004-12-15,2020-07-29 20:26:36,2031,44,4.0,NaT,2020,2020-08-03,892,Malvina Belgrano,1,EnteredByUser,1,0
2,South Korea,KR_129434,2004-01-06,2020-06-24 18:52:08,2026,39,9.0,NaT,2020,2020-06-29,892,Malvina Belgrano,1,EnteredByUser,1,0
3,Vietnam,VN_133204,2004-06-06,2020-07-29 18:04:08,2031,44,4.0,NaT,2020,2020-08-03,892,Malvina Belgrano,4,EnteredByUser,1,0
4,Vietnam,VN_133206,2004-01-31,2020-07-29 18:18:49,2031,44,4.0,NaT,2020,2020-08-03,892,Malvina Belgrano,4,EnteredByUser,1,0


In [6]:
df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34705 entries, 0 to 34704
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   MarketName        34705 non-null  object        
 1   Booking           34705 non-null  object        
 2   DateOfBirth       34703 non-null  datetime64[ns]
 3   DateBooked        34705 non-null  datetime64[ns]
 4   WeekBooked        34705 non-null  int64         
 5   SortOrder         34705 non-null  int64         
 6   ProgramDuration   34704 non-null  float64       
 7   DateCaxed         10320 non-null  datetime64[ns]
 8   FiscalYear        34705 non-null  int64         
 9   ProgramStartDate  34601 non-null  datetime64[ns]
 10  SalesRepID        34705 non-null  int64         
 11  SalesRepName      34705 non-null  object        
 12  OriginalRanking   34705 non-null  int64         
 13  MethodOfCreation  34701 non-null  object        
 14  hasBooked         3470

### Pre-Processing

In [7]:

df_bookings = (df_original
               .assign(
                       ProgramDuration = lambda _df: _df['ProgramDuration'].fillna(_df['ProgramDuration'].median()).astype('int16'),
                       DaysToCAX = lambda _df: (_df['DateCaxed'] - _df['DateBooked']).dt.days.astype('Int64'),
                       WeekNum = lambda _df: _df['WeekBooked'].astype('str').str[-2:].astype('int8'),
                       hasCaxed = lambda _df: _df['hasCaxed'].astype('bool'),
                       hasBooked = lambda _df: _df['hasBooked'].astype('bool'),
                )
               .rename(columns={'Booking': 'BookingId'})
)


In [8]:
df_bookings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34705 entries, 0 to 34704
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   MarketName        34705 non-null  object        
 1   BookingId         34705 non-null  object        
 2   DateOfBirth       34703 non-null  datetime64[ns]
 3   DateBooked        34705 non-null  datetime64[ns]
 4   WeekBooked        34705 non-null  int64         
 5   SortOrder         34705 non-null  int64         
 6   ProgramDuration   34705 non-null  int16         
 7   DateCaxed         10320 non-null  datetime64[ns]
 8   FiscalYear        34705 non-null  int64         
 9   ProgramStartDate  34601 non-null  datetime64[ns]
 10  SalesRepID        34705 non-null  int64         
 11  SalesRepName      34705 non-null  object        
 12  OriginalRanking   34705 non-null  int64         
 13  MethodOfCreation  34701 non-null  object        
 14  hasBooked         3470

In [1]:
data = df_bookings.groupby(['MarketName','SalesRepName'])[['BookingId', 'ProgramDuration']].agg({'BookingId':'count', 'ProgramDuration':'sum'}).reset_index()
fig = px.scatter(data, x="BookingId", y="ProgramDuration", color="MarketName", opacity= 0.8, size='ProgramDuration')
fig.show()

NameError: name 'df_bookings' is not defined

In [9]:
df_bookings.columns

Index(['MarketName', 'BookingId', 'DateOfBirth', 'DateBooked', 'WeekBooked',
       'SortOrder', 'ProgramDuration', 'DateCaxed', 'FiscalYear',
       'ProgramStartDate', 'SalesRepID', 'SalesRepName', 'OriginalRanking',
       'MethodOfCreation', 'hasBooked', 'hasCaxed', 'DaysToCAX', 'WeekNum'],
      dtype='object')

In [11]:
df_bookings[df_bookings['DateCaxed'].isna()]

Unnamed: 0,MarketName,BookingId,DateOfBirth,DateBooked,WeekBooked,SortOrder,ProgramDuration,DateCaxed,FiscalYear,ProgramStartDate,SalesRepID,SalesRepName,OriginalRanking,MethodOfCreation,hasBooked,hasCaxed,DaysToCAX,WeekNum
0,China,CN_198974,2004-05-06,2020-06-24 21:23:37,2026,39,9,NaT,2020,2020-06-29,892,Malvina Belgrano,6,Website,True,False,,26
1,Japan,JP_180044,2004-12-15,2020-07-29 20:26:36,2031,44,4,NaT,2020,2020-08-03,892,Malvina Belgrano,1,EnteredByUser,True,False,,31
2,South Korea,KR_129434,2004-01-06,2020-06-24 18:52:08,2026,39,9,NaT,2020,2020-06-29,892,Malvina Belgrano,1,EnteredByUser,True,False,,26
3,Vietnam,VN_133204,2004-06-06,2020-07-29 18:04:08,2031,44,4,NaT,2020,2020-08-03,892,Malvina Belgrano,4,EnteredByUser,True,False,,31
4,Vietnam,VN_133206,2004-01-31,2020-07-29 18:18:49,2031,44,4,NaT,2020,2020-08-03,892,Malvina Belgrano,4,EnteredByUser,True,False,,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34700,Hong Kong,HK_137037,2004-09-12,2021-09-01 17:08:59,2135,48,35,NaT,2021,2022-09-19,69130,Christy Tam,1,EnteredByUser,True,False,,35
34701,Hong Kong,HK_137039,2004-04-14,2021-09-04 17:36:27,2136,49,39,NaT,2021,2021-09-20,69130,Christy Tam,8,Website,True,False,,36
34702,Hong Kong,HK_137016,2003-12-14,2021-08-03 17:32:04,2131,44,50,NaT,2021,2021-09-20,69130,Christy Tam,6,Website,True,False,,31
34703,Hong Kong,HK_137046,2004-08-22,2021-09-18 18:10:07,2138,51,77,NaT,2021,2022-09-19,69300,Shirley Ling,5,Website,True,False,,38


In [23]:
df_bookings.columns

Index(['MarketName', 'BookingId', 'DateOfBirth', 'DateBooked', 'WeekBooked',
       'SortOrder', 'ProgramDuration', 'DateCaxed', 'FiscalYear',
       'ProgramStartDate', 'SalesRepID', 'SalesRepName', 'OriginalRanking',
       'MethodOfCreation', 'hasBooked', 'hasCaxed', 'DaysToCAX', 'WeekNum'],
      dtype='object')

In [196]:
df_booked = (df_bookings
    .rename(columns={'DateBooked': 'MainDate'})
    .assign(
        MainDate = lambda _df: _df['MainDate'].dt.normalize(),
        RankBin = lambda _df: pd.cut(df_bookings['OriginalRanking'], bins = [0,4,5,10], labels=['1-4', '5', '6-10'],),
    )
    .drop(['OriginalRanking'], axis = 1)
    
)

In [197]:
df_booked= pd.get_dummies(df_booked, columns=['MethodOfCreation', 'RankBin'], prefix = {'MethodOfCreation':'', 'RankBin':'Rank'}, prefix_sep = '')

In [198]:
df_booked

Unnamed: 0,MarketName,BookingId,DateOfBirth,MainDate,WeekBooked,SortOrder,ProgramDuration,DateCaxed,FiscalYear,ProgramStartDate,...,hasBooked,hasCaxed,DaysToCAX,WeekNum,EnteredByUser,Excel Import,Website,Rank1-4,Rank5,Rank6-10
0,China,CN_198974,2004-05-06,2020-06-24,2026,39,9,NaT,2020,2020-06-29,...,True,False,,26,0,0,1,0,0,1
1,Japan,JP_180044,2004-12-15,2020-07-29,2031,44,4,NaT,2020,2020-08-03,...,True,False,,31,1,0,0,1,0,0
2,South Korea,KR_129434,2004-01-06,2020-06-24,2026,39,9,NaT,2020,2020-06-29,...,True,False,,26,1,0,0,1,0,0
3,Vietnam,VN_133204,2004-06-06,2020-07-29,2031,44,4,NaT,2020,2020-08-03,...,True,False,,31,1,0,0,1,0,0
4,Vietnam,VN_133206,2004-01-31,2020-07-29,2031,44,4,NaT,2020,2020-08-03,...,True,False,,31,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34700,Hong Kong,HK_137037,2004-09-12,2021-09-01,2135,48,35,NaT,2021,2022-09-19,...,True,False,,35,1,0,0,1,0,0
34701,Hong Kong,HK_137039,2004-04-14,2021-09-04,2136,49,39,NaT,2021,2021-09-20,...,True,False,,36,0,0,1,0,0,1
34702,Hong Kong,HK_137016,2003-12-14,2021-08-03,2131,44,50,NaT,2021,2021-09-20,...,True,False,,31,0,0,1,0,0,1
34703,Hong Kong,HK_137046,2004-08-22,2021-09-18,2138,51,77,NaT,2021,2022-09-19,...,True,False,,38,0,0,1,0,1,0


In [199]:
df_booked = df_booked.groupby(['MainDate', 'MarketName', 'SalesRepID', 'SalesRepName', 'WeekBooked', 'WeekNum',
       'SortOrder','FiscalYear',
       ])[['hasBooked', 'hasCaxed', 'ProgramDuration', 'EnteredByUser', 'Excel Import', 'Website', 'Rank1-4', 'Rank5','Rank6-10']].sum()

In [200]:
df_booked

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,hasBooked,hasCaxed,ProgramDuration,EnteredByUser,Excel Import,Website,Rank1-4,Rank5,Rank6-10
MainDate,MarketName,SalesRepID,SalesRepName,WeekBooked,WeekNum,SortOrder,FiscalYear,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2018-09-28,China,1997,Christy Liu,1840,40,1,2019,2,2,18,1,0,1,2,0,0
2018-09-28,China,50978,Qian Liu,1840,40,1,2019,1,1,8,1,0,0,1,0,0
2018-09-28,China,51158,Bonnie Yin,1840,40,1,2019,2,0,5,2,0,0,2,0,0
2018-09-28,China,51244,Cuicui Zhai,1840,40,1,2019,2,2,105,2,0,0,2,0,0
2018-09-28,China,53397,Ines Wang,1840,40,1,2019,4,3,10,3,0,1,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-29,Japan,64660,Maya Ichikawa,2139,39,52,2021,2,0,12,1,0,1,2,0,0
2021-09-29,Japan,67577,Ryoichi Yamamoto,2139,39,52,2021,1,0,6,0,0,1,1,0,0
2021-09-29,Japan,68123,Shu Nozawa,2139,39,52,2021,1,0,22,1,0,0,1,0,0
2021-09-29,South Korea,57612,Eric Park,2139,39,52,2021,1,0,34,1,0,0,1,0,0


In [201]:
df_booked[['Rank5']].sum()

Rank5    3538
dtype: int64

In [202]:
df_caxed = (df_bookings[df_bookings['hasCaxed']==True]
    .rename(columns={'DateBooked': 'MainDate'})
    .assign(
        MainDate = lambda _df: _df['MainDate'].dt.normalize(),
        RankBin = lambda _df: pd.cut(df_bookings['OriginalRanking'], bins = [0,4,5,10], labels=['1-4', '5', '6-10'],),
    )
    .drop(['OriginalRanking'], axis = 1)
    
)

In [203]:
df_caxed= pd.get_dummies(df_caxed, columns=['MethodOfCreation', 'RankBin'], prefix = {'MethodOfCreation':'', 'RankBin':'Rank'}, prefix_sep = '')

In [204]:
df_caxed[['Rank5']].sum()

Rank5    951
dtype: int64

In [205]:
df_caxed = df_caxed.groupby(['MainDate', 'MarketName', 'SalesRepID', 'SalesRepName', 'WeekBooked', 'WeekNum',
       'SortOrder','FiscalYear',
       ])[['hasBooked', 'hasCaxed', 'ProgramDuration', 'EnteredByUser', 'Excel Import', 'Website', 'Rank1-4', 'Rank5','Rank6-10']].sum()

In [206]:
df_caxed.drop(['hasBooked', 'hasCaxed'], axis=1, inplace=True)

In [207]:
df_caxed

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,ProgramDuration,EnteredByUser,Excel Import,Website,Rank1-4,Rank5,Rank6-10
MainDate,MarketName,SalesRepID,SalesRepName,WeekBooked,WeekNum,SortOrder,FiscalYear,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2018-09-28,China,1997,Christy Liu,1840,40,1,2019,18,1,0,1,2,0,0
2018-09-28,China,50978,Qian Liu,1840,40,1,2019,8,1,0,0,1,0,0
2018-09-28,China,51244,Cuicui Zhai,1840,40,1,2019,105,2,0,0,2,0,0
2018-09-28,China,53397,Ines Wang,1840,40,1,2019,8,2,0,1,2,0,1
2018-09-28,China,57068,Jeremy Wanghz,1840,40,1,2019,12,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-23,Japan,64620,Yumi Ono,2138,38,51,2021,12,0,0,1,1,0,0
2021-09-23,South Korea,51766,Jimin Kang,2138,38,51,2021,26,1,0,0,1,0,0
2021-09-23,South Korea,57611,Sunjeong Lee,2138,38,51,2021,68,1,0,0,1,0,0
2021-09-24,China,57957,Doris Han,2139,39,52,2021,62,2,0,0,2,0,0


In [223]:
df_merged = (df_booked #.set_index(['MainDate',	'MarketName',	'SalesRepID'])
.merge (df_caxed #.set_index(['MainDate',	'MarketName',	'SalesRepID'])
       , left_index=True, right_index=True, how='left', suffixes=('', '_cax'))
.assign(
    # **{col : lambda _df: _df[col].fillna(0).astype('int16') 
            #    for col in ['ProgramDuration_cax', 'EnteredByUser_cax', 'Excel Import_cax', 'Website_cax']}
    ProgramDuration_cax = lambda _df: _df['ProgramDuration_cax'].fillna(0).astype('int16'),
    EnteredByUser_cax = lambda _df: _df['EnteredByUser_cax'].fillna(0).astype('int16'),      
    **{'Excel Import_cax' : lambda _df: _df['Excel Import_cax'].fillna(0).astype('int16')},    
    Website_cax = lambda _df: _df['Website_cax'].fillna(0).astype('int16'),
    **{'Rank1-4_cax' : lambda _df: _df['Rank1-4_cax'].fillna(0).astype('int16')},
    **{'Rank5_cax' : lambda _df: _df['Rank5_cax'].fillna(0).astype('int16')},
    **{'Rank6-10_cax' : lambda _df: _df['Rank6-10_cax'].fillna(0).astype('int16')},   
)
.rename(columns={'Excel Import_cax': 'ExcelImport_cax', 'Excel Import': 'ExcelImport'})
).reset_index()

In [225]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20112 entries, 0 to 20111
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   MainDate             20112 non-null  datetime64[ns]
 1   MarketName           20112 non-null  object        
 2   SalesRepID           20112 non-null  int64         
 3   SalesRepName         20112 non-null  object        
 4   WeekBooked           20112 non-null  int64         
 5   WeekNum              20112 non-null  int64         
 6   SortOrder            20112 non-null  int64         
 7   FiscalYear           20112 non-null  int64         
 8   hasBooked            20112 non-null  int64         
 9   hasCaxed             20112 non-null  int64         
 10  ProgramDuration      20112 non-null  int16         
 11  EnteredByUser        20112 non-null  uint8         
 12  Excel Import         20112 non-null  uint8         
 13  Website              20112 non-

In [226]:
df_merged

Unnamed: 0,MainDate,MarketName,SalesRepID,SalesRepName,WeekBooked,WeekNum,SortOrder,FiscalYear,hasBooked,hasCaxed,...,Rank1-4,Rank5,Rank6-10,ProgramDuration_cax,EnteredByUser_cax,Excel Import_cax,Website_cax,Rank1-4_cax,Rank5_cax,Rank6-10_cax
0,2018-09-28,China,1997,Christy Liu,1840,40,1,2019,2,2,...,2,0,0,18,1,0,1,2,0,0
1,2018-09-28,China,50978,Qian Liu,1840,40,1,2019,1,1,...,1,0,0,8,1,0,0,1,0,0
2,2018-09-28,China,51158,Bonnie Yin,1840,40,1,2019,2,0,...,2,0,0,0,0,0,0,0,0,0
3,2018-09-28,China,51244,Cuicui Zhai,1840,40,1,2019,2,2,...,2,0,0,105,2,0,0,2,0,0
4,2018-09-28,China,53397,Ines Wang,1840,40,1,2019,4,3,...,3,0,1,8,2,0,1,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20107,2021-09-29,Japan,64660,Maya Ichikawa,2139,39,52,2021,2,0,...,2,0,0,0,0,0,0,0,0,0
20108,2021-09-29,Japan,67577,Ryoichi Yamamoto,2139,39,52,2021,1,0,...,1,0,0,0,0,0,0,0,0,0
20109,2021-09-29,Japan,68123,Shu Nozawa,2139,39,52,2021,1,0,...,1,0,0,0,0,0,0,0,0,0
20110,2021-09-29,South Korea,57612,Eric Park,2139,39,52,2021,1,0,...,1,0,0,0,0,0,0,0,0,0


In [227]:
df_merged[['Rank5', 'Rank5_cax']].sum()

Rank5        3538
Rank5_cax     951
dtype: int64

In [48]:
df = helper.get_dataframe_from_sqlserver_query('select * from dailysalesdata')
df


pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.



Unnamed: 0,id,MainDate,MarketName,SalesRepID,SalesRepName,WeekBooked,WeekNum,SortOrder,FiscalYear,hasBooked,...,Rank1-4,Rank5,Rank6-10,ProgramDuration_cax,EnteredByUser_cax,ExcelImport_cax,Website_cax,Rank1-4_cax,Rank5_cax,Rank6-10_cax
0,1,2018-09-28,China,1997,Christy Liu,1840,40,1,2019,2,...,2,0,0,18,1,0,1,2,0,0
1,2,2018-09-28,China,50978,Qian Liu,1840,40,1,2019,1,...,1,0,0,8,1,0,0,1,0,0
2,3,2018-09-28,China,51158,Bonnie Yin,1840,40,1,2019,2,...,2,0,0,0,0,0,0,0,0,0
3,4,2018-09-28,China,51244,Cuicui Zhai,1840,40,1,2019,2,...,2,0,0,105,2,0,0,2,0,0
4,5,2018-09-28,China,53397,Ines Wang,1840,40,1,2019,4,...,3,0,1,8,2,0,1,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20107,20108,2021-09-29,Japan,64660,Maya Ichikawa,2139,39,52,2021,2,...,2,0,0,0,0,0,0,0,0,0
20108,20109,2021-09-29,Japan,67577,Ryoichi Yamamoto,2139,39,52,2021,1,...,1,0,0,0,0,0,0,0,0,0
20109,20110,2021-09-29,Japan,68123,Shu Nozawa,2139,39,52,2021,1,...,1,0,0,0,0,0,0,0,0,0
20110,20111,2021-09-29,South Korea,57612,Eric Park,2139,39,52,2021,1,...,1,0,0,0,0,0,0,0,0,0


In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20112 entries, 0 to 20111
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   20112 non-null  int64 
 1   MainDate             20112 non-null  object
 2   MarketName           20112 non-null  object
 3   SalesRepID           20112 non-null  int64 
 4   SalesRepName         20112 non-null  object
 5   WeekBooked           20112 non-null  int64 
 6   WeekNum              20112 non-null  int64 
 7   SortOrder            20112 non-null  int64 
 8   FiscalYear           20112 non-null  int64 
 9   hasBooked            20112 non-null  int64 
 10  hasCaxed             20112 non-null  int64 
 11  ProgramDuration      20112 non-null  int64 
 12  EnteredByUser        20112 non-null  int64 
 13  ExcelImport          20112 non-null  int64 
 14  Website              20112 non-null  int64 
 15  Rank1-4              20112 non-null  int64 
 16  Rank

In [50]:
df.columns

Index(['id', 'MainDate', 'MarketName', 'SalesRepID', 'SalesRepName',
       'WeekBooked', 'WeekNum', 'SortOrder', 'FiscalYear', 'hasBooked',
       'hasCaxed', 'ProgramDuration', 'EnteredByUser', 'ExcelImport',
       'Website', 'Rank1-4', 'Rank5', 'Rank6-10', 'ProgramDuration_cax',
       'EnteredByUser_cax', 'ExcelImport_cax', 'Website_cax', 'Rank1-4_cax',
       'Rank5_cax', 'Rank6-10_cax'],
      dtype='object')

In [58]:
df = helper.get_dataframe_from_sqlserver_query('select * from dailysalesdata')

num_cols = ['hasBooked',
       'hasCaxed', 'ProgramDuration', 'EnteredByUser', 'ExcelImport',
       'Website', 'Rank1-4', 'Rank5', 'Rank6-10', 'ProgramDuration_cax',
       'EnteredByUser_cax', 'ExcelImport_cax', 'Website_cax', 'Rank1-4_cax',
       'Rank5_cax', 'Rank6-10_cax']
nonnum = ['MarketName', 'SalesRepID', 'SalesRepName',]

df =(
df.assign(
    MainDate = lambda _df: pd.to_datetime(_df['MainDate']).dt.normalize()
)
.drop('id',axis=1)
)

numeric = num_cols
non_num = nonnum
d = {**{x: 'first' for x in non_num}, **{x: 'sum' for x in numeric}}


# df = df.resample('1D').agg(d)



pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.



In [69]:
df =df.groupby(nonnum).resample('1D', on='MainDate').agg(d).drop(nonnum, axis=1).reset_index()


In [70]:
df[df['SalesRepName'] == 'Christy Liu']

Unnamed: 0,MarketName,SalesRepID,SalesRepName,MainDate,hasBooked,hasCaxed,ProgramDuration,EnteredByUser,ExcelImport,Website,Rank1-4,Rank5,Rank6-10,ProgramDuration_cax,EnteredByUser_cax,ExcelImport_cax,Website_cax,Rank1-4_cax,Rank5_cax,Rank6-10_cax
5566,China,1997,Christy Liu,2018-09-28,2,2,18,1,0,1,2,0,0,18,1,0,1,2,0,0
5567,China,1997,Christy Liu,2018-09-29,2,0,4,2,0,0,2,0,0,0,0,0,0,0,0,0
5568,China,1997,Christy Liu,2018-09-30,1,0,2,1,0,0,1,0,0,0,0,0,0,0,0,0
5569,China,1997,Christy Liu,2018-10-01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5570,China,1997,Christy Liu,2018-10-02,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5820,China,1997,Christy Liu,2019-06-09,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5821,China,1997,Christy Liu,2019-06-10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5822,China,1997,Christy Liu,2019-06-11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5823,China,1997,Christy Liu,2019-06-12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [71]:

df[df['SalesRepName'] == 'Christy Liu']

Unnamed: 0,MarketName,SalesRepID,SalesRepName,MainDate,hasBooked,hasCaxed,ProgramDuration,EnteredByUser,ExcelImport,Website,Rank1-4,Rank5,Rank6-10,ProgramDuration_cax,EnteredByUser_cax,ExcelImport_cax,Website_cax,Rank1-4_cax,Rank5_cax,Rank6-10_cax
5566,China,1997,Christy Liu,2018-09-28,2,2,18,1,0,1,2,0,0,18,1,0,1,2,0,0
5567,China,1997,Christy Liu,2018-09-29,2,0,4,2,0,0,2,0,0,0,0,0,0,0,0,0
5568,China,1997,Christy Liu,2018-09-30,1,0,2,1,0,0,1,0,0,0,0,0,0,0,0,0
5569,China,1997,Christy Liu,2018-10-01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5570,China,1997,Christy Liu,2018-10-02,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5820,China,1997,Christy Liu,2019-06-09,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5821,China,1997,Christy Liu,2019-06-10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5822,China,1997,Christy Liu,2019-06-11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5823,China,1997,Christy Liu,2019-06-12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
num_cols = ['hasBooked',
       'hasCaxed', 'ProgramDuration', 'EnteredByUser', 'ExcelImport',
       'Website', 'Rank1-4', 'Rank5', 'Rank6-10', 'ProgramDuration_cax',
       'EnteredByUser_cax', 'ExcelImport_cax', 'Website_cax', 'Rank1-4_cax',
       'Rank5_cax', 'Rank6-10_cax']

In [24]:
(
df.assign(
    MainDate = lambda _df: pd.to_datetime(_df['MainDate']).dt.normalize()
)
.set_index(['MainDate'])
.drop('id',axis=1)
.resample('1D').sum()
)


The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



Unnamed: 0_level_0,SalesRepID,WeekBooked,WeekNum,SortOrder,FiscalYear,hasBooked,hasCaxed,ProgramDuration,EnteredByUser,ExcelImport,...,Rank1-4,Rank5,Rank6-10,ProgramDuration_cax,EnteredByUser_cax,ExcelImport_cax,Website_cax,Rank1-4_cax,Rank5_cax,Rank6-10_cax
MainDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-09-28,2924865,99360,2160,54,109026,117,37,1243,80,0,...,100,11,6,372,28,0,9,30,6,1
2018-09-29,2343031,80960,1760,44,88836,93,28,922,57,1,...,77,10,6,332,20,0,8,24,4,0
2018-09-30,2570553,84640,1840,46,92874,127,47,597,103,1,...,109,8,10,348,35,0,12,37,6,4
2018-10-01,1388687,47840,1040,26,52494,45,11,479,28,1,...,37,3,5,79,10,0,1,10,0,1
2018-10-02,1434931,49680,1080,27,54513,31,8,439,15,1,...,23,7,1,170,4,0,4,6,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-25,129063,6417,117,156,6063,3,0,36,1,0,...,3,0,0,0,0,0,0,0,0,0
2021-09-26,1449,2139,39,52,2021,1,0,34,1,0,...,1,0,0,0,0,0,0,0,0,0
2021-09-27,494666,17112,312,416,16168,12,0,376,4,0,...,9,2,1,0,0,0,0,0,0,0
2021-09-28,611776,25668,468,624,24252,16,0,273,3,0,...,14,1,1,0,0,0,0,0,0,0
