# Libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import date

# Data Import & join
### Datasets: PALMS
First ensure that none of the files are duplicated by checking control sums.

In [2]:
start_year = 2015
start_month = 1
n_files = 81

df_temp = pd.read_csv("data/Region_Summary_PALMS_Report_2015_01.csv", index_col=0, encoding="ISO-8859-1")
sum_previous = df_temp.sum()

for i in range(n_files):
    month = (start_month + i) % 12 + 1
    month = str(month)
    if len(month) == 1:
        month = "0" + month
    else:
        pass
    year = start_year + ((start_month + i) // 12)
    
    df_temp = pd.read_csv(f"data/Region_Summary_PALMS_Report_{year}_{month}.csv", index_col=0, encoding="ISO-8859-1")
    sum_current = df_temp.sum()
    
    # If all of the column sums are the same, then sum of the boolean comparison on the left
    # will be equal to the number of columns seen on the right
    if (sum_previous == sum_current).sum() == sum_current.shape[0]:
        print("Duplicated readings")
        print(f"Current file: region-palms-report_{year}_{month}.csv")
#         break
    else:
        sum_previous = sum_current

The loop hasn't been broken for any of the instances which means that the files are not duplicate. At least they are not positioned month by month but it is even more unlikely that a duplicated file has been saved in a file where the month differs by more than one.

Just to double-check lets check the condition for the same file.

In [3]:
(sum_current == sum_current).sum() == sum_current.shape[0]

True

As expected - everything is working correctly. Lets import the files and concatenate them.

In [4]:
start_year = 2014
start_month = 12
n_files = 82

df_palms = pd.DataFrame()

for i in range(n_files):
    month = (start_month + i) % 12 + 1
    month = str(month)
    if len(month) == 1:
        month = "0" + month
    else:
        pass
    year = start_year + ((start_month + i) // 12)
#     print(f"region-palms-report_{year}_{month}.csv")
    
    df_temp = pd.read_csv(f"data/Region_Summary_PALMS_Report_{year}_{month}.csv", index_col=0, encoding="ISO-8859-1")
    df_temp["palms_date"] = date(year, int(month), 1)

    df_palms = pd.concat([df_palms, df_temp])

column_list = df_palms.columns.tolist()
column_list = column_list[-3:-1] + column_list[:-3] + [column_list[-1]]
df_palms = df_palms[column_list]

df_palms.reset_index(inplace=True, drop=True)
df_palms

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,palms_date
0,202,1,4,0,0,0,0,1,2,0,5,0,2,0,0,2015-01-01
1,1001,1,4,0,0,0,0,0,5,0,6,1,3,150,0,2015-01-01
2,1060,1,1,0,0,0,0,0,0,0,0,0,0,0,0,2015-01-01
3,702,1,4,0,0,0,0,0,4,0,0,0,2,0,0,2015-01-01
4,1634,1,3,0,0,1,0,1,0,0,5,0,1,0,0,2015-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38708,3005,32,2,1,1,0,0,1,2,2,1,0,0,100,0,2021-10-01
38709,3015,32,4,0,0,0,0,3,5,18,11,2,2,356,1,2021-10-01
38710,294,32,1,0,0,3,0,0,0,2,0,0,0,0,0,2021-10-01
38711,617,32,3,1,0,0,0,7,0,1,3,0,0,524,0,2021-10-01


### Dataset: database

In [5]:
df_database = pd.read_csv("data/database_data.csv", index_col=0, encoding="ISO-8859-1")
df_database.head(10)

Unnamed: 0,user_ID,chapter_ID,industry,sponsor_ID,join_date,renewal_date
0,1034,23,"Medical, Chiropractor",,07/01/2004,12/01/2010
1,909,23,"Trades, Renovations-Remodeling",,02/01/2008,12/01/2008
2,2062,23,"Food/Beverages, Chef",,09/01/2010,12/01/2011
3,947,23,"Gifts, Gift Baskets",947.0,03/01/2007,04/07/2008
4,2311,23,"Mortgage, Mortgage Broker",,07/01/2007,07/01/2008
5,1536,23,"Real Estate Services, Residential Real Estate ...",,07/01/2007,04/01/2021
6,2486,23,"Insurance, Life,Health and Disability Insurance",,07/01/2007,11/01/2009
7,2322,23,"Financial, Investment Advisor",,07/01/2007,04/01/2009
8,878,23,"Health and Wellness, Massage Therapist",,07/01/2007,07/01/2008
9,753,23,"Insurance, General-Motor Insurance",947.0,10/01/2007,10/01/2008


In [6]:
df_database["join_date"] = pd.to_datetime(df_database["join_date"], format='%m/%d/%Y', errors='coerce')
df_database["renewal_date"] = pd.to_datetime(df_database["renewal_date"], format='%m/%d/%Y', errors='coerce')
# df_database["sponsor_ID"] = pd.to_numeric(df_database["sponsor_ID"], errors='coerce', downcast='Int32')
df_database.head(10)

Unnamed: 0,user_ID,chapter_ID,industry,sponsor_ID,join_date,renewal_date
0,1034,23,"Medical, Chiropractor",,2004-07-01,2010-12-01
1,909,23,"Trades, Renovations-Remodeling",,2008-02-01,2008-12-01
2,2062,23,"Food/Beverages, Chef",,2010-09-01,2011-12-01
3,947,23,"Gifts, Gift Baskets",947.0,2007-03-01,2008-04-07
4,2311,23,"Mortgage, Mortgage Broker",,2007-07-01,2008-07-01
5,1536,23,"Real Estate Services, Residential Real Estate ...",,2007-07-01,2021-04-01
6,2486,23,"Insurance, Life,Health and Disability Insurance",,2007-07-01,2009-11-01
7,2322,23,"Financial, Investment Advisor",,2007-07-01,2009-04-01
8,878,23,"Health and Wellness, Massage Therapist",,2007-07-01,2008-07-01
9,753,23,"Insurance, General-Motor Insurance",947.0,2007-10-01,2008-10-01


### Dataset: dropped_members

In [7]:
df_dropped = pd.read_csv("data/dropped_members.csv", index_col=0, encoding="ISO-8859-1")
df_dropped.head(10)

Unnamed: 0,user_ID,chapter_ID,reason,drop_date
0,637.0,9,Left Company,11/03/2021
1,1185.0,23,Changed Job,10/22/2021
2,1225.0,14,Takes Too Much Time,10/13/2021
3,2138.0,15,Did Not Renew - No reason Given,11/01/2021
4,1610.0,15,Other Reason (see notes),10/12/2021
5,2574.0,12,Other Reason (see notes),10/28/2021
6,629.0,21,Changed Job,10/28/2021
7,722.0,13,Takes Too Much Time,10/22/2021
8,2580.0,23,Not Enough Referrals,09/23/2021
9,2336.0,6,Scheduling Conflicts,10/20/2021


In [8]:
df_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2223 entries, 0 to 2237
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   user_ID     2220 non-null   float64
 1   chapter_ID  2223 non-null   int64  
 2   reason      1355 non-null   object 
 3   drop_date   2223 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 86.8+ KB


Seems like there are some missing values in ```user_ID``` column, which is odd. There shouldn't be any missings values in this table.

In [9]:
df_dropped[df_dropped["user_ID"].isna()]

Unnamed: 0,user_ID,chapter_ID,reason,drop_date
884,,11,Personal Reasons,01/10/2018
1053,,31,Going back to School,03/28/2017
2147,,9,,01/06/2010


All those records are listed from a time period before the PALMS data that is being looked into, so those records can be dropped.

In [10]:
df_dropped.dropna(subset=["user_ID"], inplace=True)
df_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2220 entries, 0 to 2237
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   user_ID     2220 non-null   float64
 1   chapter_ID  2220 non-null   int64  
 2   reason      1353 non-null   object 
 3   drop_date   2220 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 86.7+ KB


In [11]:
df_dropped["drop_date"] = pd.to_datetime(df_dropped["drop_date"], format='%m/%d/%Y', errors='coerce')
df_dropped["user_ID"] = pd.to_numeric(df_dropped["user_ID"], downcast='integer')
df_dropped.head()

Unnamed: 0,user_ID,chapter_ID,reason,drop_date
0,637,9,Left Company,2021-11-03
1,1185,23,Changed Job,2021-10-22
2,1225,14,Takes Too Much Time,2021-10-13
3,2138,15,Did Not Renew - No reason Given,2021-11-01
4,1610,15,Other Reason (see notes),2021-10-12


## Ensure data Integrity
 1. Calculate correct membership length for chapter transfers/rejoins.
 2. Remove drop date from member transfers/rejoins **IF** there is a membership continuity.
 
 
 ### 1. Calculate correct membership length for chapter transfers/rejoins.

In [12]:
df_transfers = df_database.groupby("user_ID")[["chapter_ID"]].count().copy()
df_transfers.loc[df_transfers.index == 3089] = 0
df_transfers.rename({"chapter_ID": "re_joins"}, axis=1, inplace=True)

df_database = df_database.merge(df_transfers, how="left", on="user_ID")
df_database

Unnamed: 0,user_ID,chapter_ID,industry,sponsor_ID,join_date,renewal_date,re_joins
0,1034,23,"Medical, Chiropractor",,2004-07-01,2010-12-01,1
1,909,23,"Trades, Renovations-Remodeling",,2008-02-01,2008-12-01,2
2,2062,23,"Food/Beverages, Chef",,2010-09-01,2011-12-01,2
3,947,23,"Gifts, Gift Baskets",947.0,2007-03-01,2008-04-07,3
4,2311,23,"Mortgage, Mortgage Broker",,2007-07-01,2008-07-01,1
...,...,...,...,...,...,...,...
3562,983,0,"Finance & Insurance, Health Insurance",2219.0,2021-08-01,2022-08-01,1
3563,1052,0,"Finance & Insurance, Property & Casualty Insur...",2219.0,2021-06-01,2022-06-01,1
3564,210,0,"Real Estate Services, Home Inspection",,2021-10-01,2022-10-01,1
3565,440,0,"Construction, HVAC - Heating & Air",1052.0,2021-11-01,2022-11-01,1


In [13]:
user_ID_list = df_palms["user_ID"].unique()
user_ID_list[:10]

array([ 202, 1001, 1060,  702, 1634, 2347, 2065, 2354, 2673, 1453],
      dtype=int64)

In [14]:
df_transfers = df_database.loc[(df_database["re_joins"] > 1) & (df_database["user_ID"].isin(user_ID_list))].copy()
df_transfers.sort_values("join_date", inplace=True)
df_transfers

Unnamed: 0,user_ID,chapter_ID,industry,sponsor_ID,join_date,renewal_date,re_joins
296,340,9,"Printing, Printer",,1999-06-01,2013-09-01,2
500,1426,22,"Insurance, Property & Casualty Insurance",,2000-08-01,2009-01-01,2
793,1048,20,"Marketing, Marketing Services",,2005-11-01,2006-11-01,2
515,1766,12,"Real Estate, Real Estate Sales Representative ...",,2006-02-01,2008-11-01,2
801,1761,20,"Mortgage, Mortgage Broker",743.0,2006-07-01,2007-08-27,2
...,...,...,...,...,...,...,...
1647,1102,10,"Legal & Accounting, Bookkeeping",,2021-10-01,2022-10-01,2
787,2288,12,"Finance & Insurance, Group Benefits",1730.0,2021-10-01,2022-10-01,2
2472,1044,18,"Construction, Painter & Decorator",,2021-10-01,2022-10-01,2
2650,7,19,"Construction, Electrician",,2021-11-01,2023-02-01,5


In [15]:
df_temp = df_transfers.copy()
df_temp = df_temp.merge(df_dropped, how="left", on=["user_ID", "chapter_ID"])
df_temp.sort_values(["user_ID", "join_date", "drop_date"], ascending=True, inplace=True)
df_temp.drop_duplicates(["user_ID", "chapter_ID", "join_date"], keep="last", inplace=True)
df_temp.drop_duplicates(["user_ID", "chapter_ID", "drop_date"], keep="first", inplace=True)

df_temp.drop(["industry", "sponsor_ID", "renewal_date", "reason"], axis=1, inplace=True)

df2 = df_temp["drop_date"].isnull().groupby(df_temp["user_ID"]).sum().astype(int).reset_index(name='count')
df_temp = df_temp.merge(df2, on="user_ID")

for index, row in df_temp.loc[df_temp["count"] > 1].iterrows():
    cond1 = (df_temp["user_ID"] == row["user_ID"]) & (df_temp["chapter_ID"] == row["chapter_ID"]) & (df_temp["drop_date"].isna())
    cond2 = (df_palms["user_ID"] == row["user_ID"]) & (df_palms["chapter_ID"] == row["chapter_ID"])
    df_temp.loc[cond1, "drop_date"] = df_palms.loc[cond2, "palms_date"].max()

df_temp.dropna(subset=["drop_date"], inplace=True)
df_temp["drop_date"] = pd.to_datetime(df_temp["drop_date"])

df_temp["additional_months"] = (df_temp["drop_date"] - df_temp["join_date"]) / np.timedelta64(1, 'M')
df_temp["additional_months"] = df_temp["additional_months"].round().astype(int)

df_temp["additional_months"] = df_temp.groupby("user_ID")["additional_months"].shift(1, fill_value=0)
df_temp["additional_months_cumsum"] = df_temp.groupby(["user_ID"])["additional_months"].cumsum()

df_temp = df_temp.loc[df_temp["additional_months_cumsum"] != 0]
df_temp.drop(["re_joins", "count", "additional_months", "drop_date", "join_date"], axis=1, inplace=True)

df_temp.head(10)

Unnamed: 0,user_ID,chapter_ID,additional_months_cumsum
1,7,26,8
8,32,25,11
11,39,18,24
15,50,15,11
17,68,24,25
22,108,9,62
28,147,19,37
34,188,18,7
36,208,13,19
43,267,14,23


## Join data - create a master dataframe

In [16]:
df_master = df_palms.copy()
df_master = df_master.merge(df_database, how="left", on=["user_ID", "chapter_ID"])
df_master = df_master.merge(df_dropped, how="left", on=["user_ID", "chapter_ID"])

df_master.sample(10, random_state=13)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,re_joins,reason,drop_date
27991,1518,18,4,0,0,0,0,1,0,0,...,0,4,2019-03-01,"Automotive, Automotive Sales and Service",3086.0,2018-06-01,2020-06-01,1,Other Reason (see notes),2020-01-07
30979,1314,6,4,0,0,0,0,1,12,0,...,21454,3,2019-08-01,"Real Estate Services, Commercial Real Estate",237.0,2016-04-01,2022-10-01,1,,NaT
15072,250,23,5,0,0,0,0,1,6,0,...,7517,5,2017-06-01,"Real Estate, Real Estate Sales Representative ...",1012.0,2015-05-01,2018-05-01,1,Other Reason (see notes),2017-07-11
39192,2901,12,5,0,0,0,0,1,5,4,...,5425,25,2020-09-01,"Finance & Insurance, Financial Investments",1829.0,2018-12-01,2022-03-01,2,,NaT
2948,2272,32,4,0,0,0,0,5,1,3,...,180,0,2015-07-01,"Photography, Photographer",,2010-03-22,2017-02-01,1,Personal Reasons,2016-01-12
7274,3055,32,4,0,0,0,0,4,0,9,...,519,0,2016-04-01,"Advertising & Marketing, Printer",2212.0,2013-12-01,2021-06-01,2,Did Not Renew - No reason Given,2021-06-01
25424,1802,19,4,0,0,0,0,0,4,0,...,491,10,2018-11-01,"Employment Activities, Recruiter",,2015-05-01,2022-05-01,1,,NaT
1743,2861,12,4,0,0,0,0,1,3,1,...,545,0,2015-05-01,"Health and Wellness, Fitness Trainer",,2013-03-01,2016-03-01,1,Other Reason (see notes),2016-03-01
30003,1739,19,4,0,0,0,0,0,3,5,...,0,11,2019-06-01,"Finance & Insurance, Property & Casualty Insur...",108.0,2018-12-01,2022-02-01,1,,NaT
33433,1924,8,2,0,0,0,0,0,0,0,...,0,0,2019-12-01,"Consulting, Energy Consultant",1974.0,2018-12-01,2019-12-01,1,Other Reason (see notes),2019-12-05


# Data cleaning & aggregation
## Remove duplicates

In [17]:
df_master_clean = df_master.copy()

df_master_clean.drop(["industry", "sponsor_ID", "re_joins"], axis=1, inplace=True)

df_master_clean["palms_date"] = pd.to_datetime(df_master_clean["palms_date"], errors='coerce')
df_master_clean["renewal_date"] = pd.to_datetime(df_master_clean["renewal_date"], errors='coerce')

df_master_clean.shape[0]

47901

In [18]:
df_master_clean.sort_values(["palms_date", "join_date", "drop_date"], inplace=True)

df_master_clean.drop_duplicates(subset=df_master_clean.columns[:16].tolist() + ["join_date"], keep="last", inplace=True)
df_master_clean.drop_duplicates(subset=df_master_clean.columns[:16], inplace=True)
df_master_clean.shape[0]

38712

## Get relative renewal date for data aggregation

In [19]:
df_master_clean = df_master_clean.merge(df_temp, how="left", on=["user_ID", "chapter_ID"])
df_master_clean.loc[df_master_clean["additional_months_cumsum"].isna(), "additional_months_cumsum"] = 0
df_master_clean["additional_months_cumsum"] = df_master_clean["additional_months_cumsum"].astype(int)
df_master_clean.head()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,V,1-2-1,TYFCB,CEU,palms_date,join_date,renewal_date,reason,drop_date,additional_months_cumsum
0,447,9,4,0,0,0,0,1,1,0,...,0,7,0,0,2015-01-01,1998-05-01,2022-07-01,,NaT,0
1,1366,9,4,0,0,0,0,4,0,0,...,0,7,4975,0,2015-01-01,2002-09-01,2022-06-01,,NaT,0
2,1197,9,4,0,0,0,0,2,3,0,...,0,3,21126,0,2015-01-01,2006-11-01,2018-02-01,No Reason Entered,2018-01-11,0
3,1035,9,4,0,0,0,0,0,7,0,...,0,6,500,0,2015-01-01,2007-07-01,2015-05-01,No Reason Entered,2015-06-04,0
4,1536,23,3,0,0,0,1,0,3,0,...,0,0,0,0,2015-01-01,2007-07-01,2021-04-01,Member Transferred BNI Chapters,2020-12-18,0


In [20]:
df_master_clean["year_of_membership"] = (df_master_clean["palms_date"] - df_master_clean["join_date"]) \
    / np.timedelta64(1, 'M') + df_master_clean["additional_months_cumsum"]
df_master_clean["year_of_membership"] = df_master_clean["year_of_membership"] // 12
df_master_clean["months_to_renewal"] = (df_master_clean["renewal_date"] - df_master_clean["palms_date"]) \
    / np.timedelta64(1, 'M')
df_master_clean["months_to_renewal"] = df_master_clean["months_to_renewal"].round().astype(int)
df_master_clean["years_to_renewal"] = (df_master_clean["months_to_renewal"] - 1) // 12
df_master_clean["months_to_renewal"] = df_master_clean["months_to_renewal"] % 12

# Substitute "months_to_renewal" value 0 with 12 for ease of aggregation later on
df_master_clean.loc[df_master_clean["months_to_renewal"] == 0, "months_to_renewal"] = 12

for index, row in df_master_clean.iterrows():
    df_master_clean.at[index, 'relative_renewal_date'] = row['renewal_date'] - pd.DateOffset(years=row['years_to_renewal'])

df_master_clean.drop(["years_to_renewal", "additional_months_cumsum"], axis=1, inplace=True)
df_master_clean.sample(10, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,join_date,renewal_date,reason,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
31267,2581,31,5,0,0,0,0,0,1,0,...,0,13,2020-09-01,2018-03-01,2021-03-01,Personal Reasons,2021-01-31,2.0,6,2021-03-01
9055,3042,9,3,0,0,0,0,0,1,0,...,1500,5,2016-12-01,2014-08-12,2018-08-01,Takes Too Much Time,2018-01-11,2.0,8,2017-08-01
4955,1129,12,4,0,0,0,0,3,0,2,...,327,0,2016-02-01,2015-12-01,2017-06-01,Takes Too Much Time,2017-03-28,0.0,4,2016-06-01
30619,2724,32,4,0,0,0,0,2,1,2,...,1932,4,2020-08-01,2012-03-01,2022-08-01,,NaT,8.0,12,2021-08-01
28844,2598,25,2,0,0,0,0,0,0,0,...,0,0,2020-04-01,2019-07-01,2020-08-01,Not Right Fit with Company,2020-03-12,0.0,4,2020-08-01
29693,1902,24,4,0,0,0,0,1,0,0,...,1531,4,2020-06-01,2017-06-01,2022-05-01,,NaT,3.0,11,2021-05-01
22808,2840,6,4,1,0,0,0,0,3,0,...,101,14,2019-04-01,2019-02-01,2020-04-01,No Reason Entered,2019-06-03,0.0,12,2020-04-01
3895,1413,12,2,0,0,0,0,1,3,0,...,317,0,2015-12-01,2011-02-01,2016-08-01,,2016-08-01,4.0,8,2016-08-01
9592,1231,19,4,0,0,0,0,1,1,1,...,0,5,2017-01-01,2016-01-01,2020-07-01,Personal Reasons,2020-07-01,1.0,6,2017-07-01
14334,2222,15,3,0,0,0,0,3,5,8,...,1232,2,2017-11-01,2017-07-01,2022-01-01,,NaT,0.0,2,2018-01-01


In [21]:
df_master_clean.loc[(df_master_clean["user_ID"] == 2822) & (df_master_clean["relative_renewal_date"] == "2019-05-01")]

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,join_date,renewal_date,reason,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
17219,2822,21,5,0,0,0,0,1,4,1,...,960,17,2018-05-01,2017-11-01,2020-05-01,Not Enough Referrals,2020-01-08,0.0,12,2019-05-01
17689,2822,21,4,0,0,0,0,0,9,3,...,1315,2,2018-06-01,2017-11-01,2020-05-01,Not Enough Referrals,2020-01-08,0.0,11,2019-05-01
18149,2822,21,4,0,0,0,0,1,5,0,...,830,4,2018-07-01,2017-11-01,2020-05-01,Not Enough Referrals,2020-01-08,0.0,10,2019-05-01
18610,2822,21,4,0,1,0,0,1,3,2,...,971,2,2018-08-01,2017-11-01,2020-05-01,Not Enough Referrals,2020-01-08,0.0,9,2019-05-01
19080,2822,21,3,0,1,0,0,2,3,0,...,2702,13,2018-09-01,2017-11-01,2020-05-01,Not Enough Referrals,2020-01-08,0.0,8,2019-05-01
19553,2822,21,4,0,0,0,0,1,2,0,...,607,2,2018-10-01,2017-11-01,2020-05-01,Not Enough Referrals,2020-01-08,0.0,7,2019-05-01
20036,2822,21,4,0,0,0,0,0,0,0,...,645,3,2018-11-01,2017-11-01,2020-05-01,Not Enough Referrals,2020-01-08,0.0,6,2019-05-01
20516,2822,21,3,0,0,0,0,0,3,0,...,300,3,2018-12-01,2017-11-01,2020-05-01,Not Enough Referrals,2020-01-08,1.0,5,2019-05-01
21021,2822,21,4,0,0,0,0,0,3,0,...,390,4,2019-01-01,2017-11-01,2020-05-01,Not Enough Referrals,2020-01-08,1.0,4,2019-05-01
21530,2822,21,4,0,0,0,0,2,4,1,...,97,4,2019-02-01,2017-11-01,2020-05-01,Not Enough Referrals,2020-01-08,1.0,3,2019-05-01


## Aggregate 9-months data
Ensure sure that each groupby sum is aggregated for 9 months, not less.

In [43]:
df_pre_agg = df_master_clean.copy()

df_pre_agg["control_count"] = 1
df_pre_agg = df_pre_agg.loc[df_pre_agg["months_to_renewal"] >= 4]

df_pre_agg.drop_duplicates(subset=df_pre_agg.columns[:16], inplace=True)

df_pre_agg.drop(["join_date",
                 "renewal_date",
                 "palms_date",
                 "drop_date",
                 "months_to_renewal"], axis=1, inplace=True)

df_pre_agg.sample(10, random_state=13)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,reason,year_of_membership,relative_renewal_date,control_count
26831,2041,15,3,0,0,0,0,2,6,2,3,0,3,4075,2,,0.0,2020-10-01,1
7599,1203,32,2,1,0,0,1,0,1,1,1,0,4,35,0,No Reason Entered,0.0,2017-07-01,1
21668,1462,9,0,0,0,4,0,0,0,0,5,0,0,906,0,"Company related (e.g. Changed Jobs, Left Compa...",1.0,2019-09-01,1
15418,295,17,4,0,0,0,0,1,1,4,5,0,4,0,0,No Reason Entered,6.0,2019-02-01,1
1981,2677,19,4,0,0,0,0,2,3,0,2,2,0,0,0,Other Reason (see notes),0.0,2016-05-01,1
32389,712,23,4,0,0,0,0,1,0,0,0,0,4,625,9,No Reason Entered,1.0,2021-10-01,1
10130,2111,25,2,2,0,0,0,1,2,0,0,0,0,4735,1,Changed Job,0.0,2017-06-01,1
1467,1361,10,3,0,0,0,1,5,0,1,1,1,0,1388,0,Other Reason (see notes),1.0,2016-01-01,1
35461,1906,11,4,0,0,0,0,5,2,2,0,0,5,1993,0,,3.0,2022-01-01,1
18908,1464,11,1,2,0,0,0,0,0,0,0,0,2,0,0,Other Reason (see notes),2.0,2019-02-01,1


In [44]:
df_agg = df_pre_agg.copy()
df_agg = df_agg.groupby(["user_ID", "chapter_ID", "relative_renewal_date"]).sum()
df_agg.reset_index(drop=False, inplace=True)
df_agg = df_agg.loc[df_agg["control_count"] >= 9]
df_agg["year_of_membership"] = (df_agg["year_of_membership"]/9).round().astype(int)
df_agg.sample(10, random_state=13)

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,year_of_membership,control_count
2245,1622,32,2017-01-01,18,13,0,0,2,11,1,34,10,1,28,1087,0,0,9
23,10,19,2018-10-01,30,4,1,0,2,3,17,3,45,0,26,3648,57,1,9
2070,1505,18,2021-02-01,35,0,0,1,1,11,63,2,10,1,10,137945,33,2,9
3199,2285,15,2019-05-01,35,0,0,0,3,10,12,14,9,0,46,11719,41,1,9
2007,1460,25,2020-05-01,37,1,0,0,0,17,7,0,7,1,51,3478,53,0,9
2739,1952,32,2021-09-01,33,3,0,0,0,17,1,5,4,0,37,64919,8,0,9
1107,830,26,2019-11-01,32,2,2,0,1,8,19,0,6,2,23,39394,27,8,9
843,619,18,2018-08-01,33,0,0,0,4,14,4,3,20,2,36,5422,29,3,9
3771,2695,6,2020-07-01,32,3,0,0,3,21,19,16,70,1,42,42424,66,0,9
2188,1587,17,2017-04-01,34,3,1,0,1,7,13,13,13,3,25,7944,21,3,9


In [45]:
df_agg.shape

(2431, 18)

In [46]:
df_agg.head()

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,year_of_membership,control_count
0,0,31,2016-04-01,33,0,0,1,0,21,7,33,32,1,37,129784,45,0,9
3,2,18,2017-05-01,33,2,2,0,0,6,17,11,22,1,26,9285,1,0,9
4,2,18,2018-05-01,35,1,0,0,2,6,24,19,36,1,20,7263,10,1,9
5,2,18,2019-05-01,36,0,0,0,1,9,8,23,19,3,28,1860,31,2,9
6,2,18,2020-05-01,33,1,0,0,3,10,13,19,47,4,30,6668,16,3,9


In [47]:
df_agg.describe()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,year_of_membership,control_count
count,2431.0,2431.0,2431.0,2431.0,2431.0,2431.0,2431.0,2431.0,2431.0,2431.0,2431.0,2431.0,2431.0,2431.0,2431.0,2431.0,2431.0
mean,1589.2826,17.929247,33.141917,1.573838,0.380502,0.485397,1.532291,10.388318,25.034554,10.196627,24.776224,3.842452,43.795969,28536.88,42.130399,1.745784,9.006993
std,881.07981,8.383352,3.653289,1.88327,1.103499,1.614394,1.583507,7.58001,19.707648,11.59807,21.189254,3.966457,23.817955,76358.42,43.158716,2.376401,0.097037
min,0.0,2.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,9.0
25%,843.5,10.5,31.0,0.0,0.0,0.0,0.0,5.0,13.0,3.0,11.0,1.0,29.0,5310.0,20.0,0.0,9.0
50%,1607.0,18.0,34.0,1.0,0.0,0.0,1.0,9.0,21.0,7.0,19.0,3.0,39.0,12516.0,36.0,1.0,9.0
75%,2338.5,25.0,36.0,2.0,0.0,0.0,2.0,14.0,32.0,13.0,32.0,5.0,53.0,27987.5,51.0,3.0,9.0
max,3080.0,32.0,40.0,22.0,15.0,22.0,11.0,62.0,224.0,126.0,296.0,36.0,276.0,1467394.0,829.0,22.0,11.0


In [48]:
df_database.head()

Unnamed: 0,user_ID,chapter_ID,industry,sponsor_ID,join_date,renewal_date,re_joins
0,1034,23,"Medical, Chiropractor",,2004-07-01,2010-12-01,1
1,909,23,"Trades, Renovations-Remodeling",,2008-02-01,2008-12-01,2
2,2062,23,"Food/Beverages, Chef",,2010-09-01,2011-12-01,2
3,947,23,"Gifts, Gift Baskets",947.0,2007-03-01,2008-04-07,3
4,2311,23,"Mortgage, Mortgage Broker",,2007-07-01,2008-07-01,1
