# Libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import date

# Data Import & join
### Datasets: PALMS
First ensure that none of the files are duplicated by checking control sums.

In [2]:
start_year = 2015
start_month = 1
n_files = 81

df_temp = pd.read_csv("data/Region_Summary_PALMS_Report_2015_01.csv", index_col=0, encoding="ISO-8859-1")
sum_previous = df_temp.sum()

for i in range(n_files):
    month = (start_month + i) % 12 + 1
    month = str(month)
    if len(month) == 1:
        month = "0" + month
    else:
        pass
    year = start_year + ((start_month + i) // 12)
    
    df_temp = pd.read_csv(f"data/Region_Summary_PALMS_Report_{year}_{month}.csv", index_col=0, encoding="ISO-8859-1")
    sum_current = df_temp.sum()
    
    # If all of the column sums are the same, then sum of the boolean comparison on the left
    # will be equal to the number of columns seen on the right
    if (sum_previous == sum_current).sum() == sum_current.shape[0]:
        print("Duplicated readings")
        print(f"Current file: region-palms-report_{year}_{month}.csv")
#         break
    else:
        sum_previous = sum_current

The loop hasn't been broken for any of the instances which means that the files are not duplicate. At least they are not positioned month by month but it is even more unlikely that a duplicated file has been saved in a file where the month differs by more than one.

Just to double-check lets check the condition for the same file.

In [3]:
(sum_current == sum_current).sum() == sum_current.shape[0]

True

As expected - everything is working correctly. Lets import the files and concatenate them.

In [4]:
start_year = 2014
start_month = 12
n_files = 82

df_palms = pd.DataFrame()

for i in range(n_files):
    month = (start_month + i) % 12 + 1
    month = str(month)
    if len(month) == 1:
        month = "0" + month
    else:
        pass
    year = start_year + ((start_month + i) // 12)
#     print(f"region-palms-report_{year}_{month}.csv")
    
    df_temp = pd.read_csv(f"data/Region_Summary_PALMS_Report_{year}_{month}.csv", index_col=0, encoding="ISO-8859-1")
    df_temp["palms_date"] = date(year, int(month), 1)

    df_palms = pd.concat([df_palms, df_temp])

column_list = df_palms.columns.tolist()
column_list = column_list[-3:-1] + column_list[:-3] + [column_list[-1]]
df_palms = df_palms[column_list]

df_palms.reset_index(inplace=True, drop=True)
df_palms

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,palms_date
0,202,1,4,0,0,0,0,1,2,0,5,0,2,0,0,2015-01-01
1,1001,1,4,0,0,0,0,0,5,0,6,1,3,150,0,2015-01-01
2,1060,1,1,0,0,0,0,0,0,0,0,0,0,0,0,2015-01-01
3,702,1,4,0,0,0,0,0,4,0,0,0,2,0,0,2015-01-01
4,1634,1,3,0,0,1,0,1,0,0,5,0,1,0,0,2015-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38708,3005,32,2,1,1,0,0,1,2,2,1,0,0,100,0,2021-10-01
38709,3015,32,4,0,0,0,0,3,5,18,11,2,2,356,1,2021-10-01
38710,294,32,1,0,0,3,0,0,0,2,0,0,0,0,0,2021-10-01
38711,617,32,3,1,0,0,0,7,0,1,3,0,0,524,0,2021-10-01


### Dataset: database

In [5]:
df_database = pd.read_csv("data/database_data.csv", index_col=0, encoding="ISO-8859-1")
df_database.head(10)

Unnamed: 0,user_ID,chapter_ID,industry,sponsor_ID,join_date,renewal_date
0,1034,23,"Medical, Chiropractor",,07/01/2004,12/01/2010
1,909,23,"Trades, Renovations-Remodeling",,02/01/2008,12/01/2008
2,2062,23,"Food/Beverages, Chef",,09/01/2010,12/01/2011
3,947,23,"Gifts, Gift Baskets",947.0,03/01/2007,04/07/2008
4,2311,23,"Mortgage, Mortgage Broker",,07/01/2007,07/01/2008
5,1536,23,"Real Estate Services, Residential Real Estate ...",,07/01/2007,04/01/2021
6,2486,23,"Insurance, Life,Health and Disability Insurance",,07/01/2007,11/01/2009
7,2322,23,"Financial, Investment Advisor",,07/01/2007,04/01/2009
8,878,23,"Health and Wellness, Massage Therapist",,07/01/2007,07/01/2008
9,753,23,"Insurance, General-Motor Insurance",947.0,10/01/2007,10/01/2008


In [6]:
df_database["join_date"] = pd.to_datetime(df_database["join_date"], format='%m/%d/%Y', errors='coerce')
df_database["renewal_date"] = pd.to_datetime(df_database["renewal_date"], format='%m/%d/%Y', errors='coerce')
# df_database["sponsor_ID"] = pd.to_numeric(df_database["sponsor_ID"], errors='coerce', downcast='Int32')
df_database.head(10)

Unnamed: 0,user_ID,chapter_ID,industry,sponsor_ID,join_date,renewal_date
0,1034,23,"Medical, Chiropractor",,2004-07-01,2010-12-01
1,909,23,"Trades, Renovations-Remodeling",,2008-02-01,2008-12-01
2,2062,23,"Food/Beverages, Chef",,2010-09-01,2011-12-01
3,947,23,"Gifts, Gift Baskets",947.0,2007-03-01,2008-04-07
4,2311,23,"Mortgage, Mortgage Broker",,2007-07-01,2008-07-01
5,1536,23,"Real Estate Services, Residential Real Estate ...",,2007-07-01,2021-04-01
6,2486,23,"Insurance, Life,Health and Disability Insurance",,2007-07-01,2009-11-01
7,2322,23,"Financial, Investment Advisor",,2007-07-01,2009-04-01
8,878,23,"Health and Wellness, Massage Therapist",,2007-07-01,2008-07-01
9,753,23,"Insurance, General-Motor Insurance",947.0,2007-10-01,2008-10-01


### Dataset: dropped_members

In [7]:
df_dropped = pd.read_csv("data/dropped_members.csv", index_col=0, encoding="ISO-8859-1")
df_dropped.head(10)

Unnamed: 0,user_ID,chapter_ID,reason,drop_date
0,637.0,9,Left Company,11/03/2021
1,1185.0,23,Changed Job,10/22/2021
2,1225.0,14,Takes Too Much Time,10/13/2021
3,2138.0,15,Did Not Renew - No reason Given,11/01/2021
4,1610.0,15,Other Reason (see notes),10/12/2021
5,2574.0,12,Other Reason (see notes),10/28/2021
6,629.0,21,Changed Job,10/28/2021
7,722.0,13,Takes Too Much Time,10/22/2021
8,2580.0,23,Not Enough Referrals,09/23/2021
9,2336.0,6,Scheduling Conflicts,10/20/2021


In [8]:
df_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2223 entries, 0 to 2237
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   user_ID     2220 non-null   float64
 1   chapter_ID  2223 non-null   int64  
 2   reason      1355 non-null   object 
 3   drop_date   2223 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 86.8+ KB


Seems like there are some missing values in ```user_ID``` column, which is odd. There shouldn't be any missings values in this table.

In [9]:
df_dropped[df_dropped["user_ID"].isna()]

Unnamed: 0,user_ID,chapter_ID,reason,drop_date
884,,11,Personal Reasons,01/10/2018
1053,,31,Going back to School,03/28/2017
2147,,9,,01/06/2010


Disregards the records.

In [10]:
df_dropped.dropna(subset=["user_ID"], inplace=True)
df_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2220 entries, 0 to 2237
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   user_ID     2220 non-null   float64
 1   chapter_ID  2220 non-null   int64  
 2   reason      1353 non-null   object 
 3   drop_date   2220 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 86.7+ KB


In [11]:
df_dropped["drop_date"] = pd.to_datetime(df_dropped["drop_date"], format='%m/%d/%Y', errors='coerce')
df_dropped["user_ID"] = pd.to_numeric(df_dropped["user_ID"], downcast='integer')
df_dropped.head()

Unnamed: 0,user_ID,chapter_ID,reason,drop_date
0,637,9,Left Company,2021-11-03
1,1185,23,Changed Job,2021-10-22
2,1225,14,Takes Too Much Time,2021-10-13
3,2138,15,Did Not Renew - No reason Given,2021-11-01
4,1610,15,Other Reason (see notes),2021-10-12


## Ensure data Integrity
 1. Calculate correct membership length for chapter transfers/rejoins.
 2. Remove drop date from member transfers/rejoins **IF** there is a membership continuity.
 
 
 ### 1. Calculate correct membership length for chapter transfers/rejoins.

In [12]:
df_transfers = df_database.groupby("user_ID")[["chapter_ID"]].count().copy()
df_transfers.loc[df_transfers.index == 3089] = 0
df_transfers.rename({"chapter_ID": "re_joins"}, axis=1, inplace=True)

df_database = df_database.merge(df_transfers, how="left", on="user_ID")
df_database

Unnamed: 0,user_ID,chapter_ID,industry,sponsor_ID,join_date,renewal_date,re_joins
0,1034,23,"Medical, Chiropractor",,2004-07-01,2010-12-01,1
1,909,23,"Trades, Renovations-Remodeling",,2008-02-01,2008-12-01,2
2,2062,23,"Food/Beverages, Chef",,2010-09-01,2011-12-01,2
3,947,23,"Gifts, Gift Baskets",947.0,2007-03-01,2008-04-07,3
4,2311,23,"Mortgage, Mortgage Broker",,2007-07-01,2008-07-01,1
...,...,...,...,...,...,...,...
3562,983,0,"Finance & Insurance, Health Insurance",2219.0,2021-08-01,2022-08-01,1
3563,1052,0,"Finance & Insurance, Property & Casualty Insur...",2219.0,2021-06-01,2022-06-01,1
3564,210,0,"Real Estate Services, Home Inspection",,2021-10-01,2022-10-01,1
3565,440,0,"Construction, HVAC - Heating & Air",1052.0,2021-11-01,2022-11-01,1


In [13]:
user_ID_list = df_palms["user_ID"].unique()
user_ID_list[:10]

array([ 202, 1001, 1060,  702, 1634, 2347, 2065, 2354, 2673, 1453],
      dtype=int64)

In [14]:
df_transfers = df_database.loc[(df_database["re_joins"] > 1) & (df_database["user_ID"].isin(user_ID_list))].copy()
df_transfers.sort_values("join_date", inplace=True)
df_transfers

Unnamed: 0,user_ID,chapter_ID,industry,sponsor_ID,join_date,renewal_date,re_joins
296,340,9,"Printing, Printer",,1999-06-01,2013-09-01,2
500,1426,22,"Insurance, Property & Casualty Insurance",,2000-08-01,2009-01-01,2
793,1048,20,"Marketing, Marketing Services",,2005-11-01,2006-11-01,2
515,1766,12,"Real Estate, Real Estate Sales Representative ...",,2006-02-01,2008-11-01,2
801,1761,20,"Mortgage, Mortgage Broker",743.0,2006-07-01,2007-08-27,2
...,...,...,...,...,...,...,...
1647,1102,10,"Legal & Accounting, Bookkeeping",,2021-10-01,2022-10-01,2
787,2288,12,"Finance & Insurance, Group Benefits",1730.0,2021-10-01,2022-10-01,2
2472,1044,18,"Construction, Painter & Decorator",,2021-10-01,2022-10-01,2
2650,7,19,"Construction, Electrician",,2021-11-01,2023-02-01,5


Create a dataframe with amount of missing ```drop_date```'s

In [15]:
df_temp = df_transfers.copy()
df_temp = df_temp.merge(df_dropped, how="left", on=["user_ID", "chapter_ID"])
df_temp.sort_values(["user_ID", "join_date", "drop_date"], ascending=True, inplace=True)
df_temp.drop_duplicates(["user_ID", "chapter_ID", "join_date"], keep="last", inplace=True)
df_temp.drop_duplicates(["user_ID", "chapter_ID", "drop_date"], keep="first", inplace=True)

df_temp.drop(["industry", "sponsor_ID", "renewal_date", "reason"], axis=1, inplace=True)

# Get amount of missing 'drop_date' per user
df2 = df_temp["drop_date"].isnull().groupby(df_temp["user_ID"]).sum().astype(int).reset_index(name='count')
df_temp = df_temp.merge(df2, on="user_ID")
df_temp.head()

Unnamed: 0,user_ID,chapter_ID,join_date,re_joins,drop_date,count
0,7,21,2018-05-01,5,2019-01-07,1
1,7,26,2019-01-01,5,2021-10-19,1
2,7,19,2021-11-01,5,NaT,1
3,20,26,2016-11-01,2,2016-11-04,0
4,20,31,2016-11-01,2,2017-03-15,0


Get the amount of months to be added per user re-join

In [16]:
for index, row in df_temp.loc[df_temp["count"] > 1].iterrows():
    cond1 = (df_temp["user_ID"] == row["user_ID"]) & (df_temp["chapter_ID"] == row["chapter_ID"]) & (df_temp["drop_date"].isna())
    cond2 = (df_palms["user_ID"] == row["user_ID"]) & (df_palms["chapter_ID"] == row["chapter_ID"])
    df_temp.loc[cond1, "drop_date"] = df_palms.loc[cond2, "palms_date"].max()

df_temp.dropna(subset=["drop_date"], inplace=True)
df_temp["drop_date"] = pd.to_datetime(df_temp["drop_date"])

df_temp["additional_months"] = (df_temp["drop_date"] - df_temp["join_date"]) / np.timedelta64(1, 'M')
df_temp["additional_months"] = df_temp["additional_months"].round().astype(int)

df_temp["additional_months"] = df_temp.groupby("user_ID")["additional_months"].shift(1, fill_value=0)
df_temp["additional_months_cumsum"] = df_temp.groupby(["user_ID"])["additional_months"].cumsum()

df_temp = df_temp.loc[df_temp["additional_months_cumsum"] != 0]
df_temp.drop(["re_joins", "count", "additional_months", "drop_date", "join_date"], axis=1, inplace=True)

df_temp.head(10)

Unnamed: 0,user_ID,chapter_ID,additional_months_cumsum
1,7,26,8
8,32,25,11
11,39,18,24
15,50,15,11
17,68,24,25
22,108,9,62
28,147,19,37
34,188,18,7
36,208,13,19
43,267,14,23


## Join data - create a master dataframe

In [17]:
df_master = df_palms.copy()
df_master = df_master.merge(df_database, how="left", on=["user_ID", "chapter_ID"])
df_master = df_master.merge(df_dropped, how="left", on=["user_ID", "chapter_ID"])

df_master.sample(10, random_state=13)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,re_joins,reason,drop_date
27991,1518,18,4,0,0,0,0,1,0,0,...,0,4,2019-03-01,"Automotive, Automotive Sales and Service",3086.0,2018-06-01,2020-06-01,1,Other Reason (see notes),2020-01-07
30979,1314,6,4,0,0,0,0,1,12,0,...,21454,3,2019-08-01,"Real Estate Services, Commercial Real Estate",237.0,2016-04-01,2022-10-01,1,,NaT
15072,250,23,5,0,0,0,0,1,6,0,...,7517,5,2017-06-01,"Real Estate, Real Estate Sales Representative ...",1012.0,2015-05-01,2018-05-01,1,Other Reason (see notes),2017-07-11
39192,2901,12,5,0,0,0,0,1,5,4,...,5425,25,2020-09-01,"Finance & Insurance, Financial Investments",1829.0,2018-12-01,2022-03-01,2,,NaT
2948,2272,32,4,0,0,0,0,5,1,3,...,180,0,2015-07-01,"Photography, Photographer",,2010-03-22,2017-02-01,1,Personal Reasons,2016-01-12
7274,3055,32,4,0,0,0,0,4,0,9,...,519,0,2016-04-01,"Advertising & Marketing, Printer",2212.0,2013-12-01,2021-06-01,2,Did Not Renew - No reason Given,2021-06-01
25424,1802,19,4,0,0,0,0,0,4,0,...,491,10,2018-11-01,"Employment Activities, Recruiter",,2015-05-01,2022-05-01,1,,NaT
1743,2861,12,4,0,0,0,0,1,3,1,...,545,0,2015-05-01,"Health and Wellness, Fitness Trainer",,2013-03-01,2016-03-01,1,Other Reason (see notes),2016-03-01
30003,1739,19,4,0,0,0,0,0,3,5,...,0,11,2019-06-01,"Finance & Insurance, Property & Casualty Insur...",108.0,2018-12-01,2022-02-01,1,,NaT
33433,1924,8,2,0,0,0,0,0,0,0,...,0,0,2019-12-01,"Consulting, Energy Consultant",1974.0,2018-12-01,2019-12-01,1,Other Reason (see notes),2019-12-05


# Data cleaning & aggregation
## Remove duplicates

In [18]:
df_master_clean = df_master.copy()

df_master_clean["palms_date"] = pd.to_datetime(df_master_clean["palms_date"], errors='coerce')
df_master_clean["renewal_date"] = pd.to_datetime(df_master_clean["renewal_date"], errors='coerce')

df_master_clean.shape[0]

47901

In [19]:
df_master_clean.sort_values(["palms_date", "join_date", "drop_date"], inplace=True)

df_master_clean.drop_duplicates(subset=df_master_clean.columns[:16].tolist() + ["join_date"], keep="last", inplace=True)
df_master_clean.drop_duplicates(subset=df_master_clean.columns[:16], inplace=True)
df_master_clean.shape[0]

38712

### Ensure that the renewal date is correct in PALMS data for members who transfered chapters

In [20]:
df_database.head()

Unnamed: 0,user_ID,chapter_ID,industry,sponsor_ID,join_date,renewal_date,re_joins
0,1034,23,"Medical, Chiropractor",,2004-07-01,2010-12-01,1
1,909,23,"Trades, Renovations-Remodeling",,2008-02-01,2008-12-01,2
2,2062,23,"Food/Beverages, Chef",,2010-09-01,2011-12-01,2
3,947,23,"Gifts, Gift Baskets",947.0,2007-03-01,2008-04-07,3
4,2311,23,"Mortgage, Mortgage Broker",,2007-07-01,2008-07-01,1


In [21]:
df_check_transfers = df_database.loc[df_database["re_joins"] >= 2].copy()
df_check_transfers.sort_values(["user_ID", "join_date", "renewal_date"], inplace=True)

# Drop records which are duplicated. keep='last' as it is most likely the correct record
# (in case if someone entered a membership wrongly)
df_check_transfers.drop_duplicates(["user_ID", "chapter_ID", "join_date"], inplace=True, keep="last")

# Get daterange to cross-check in PALMS data
df_check_transfers["next_join_date"] = df_check_transfers.groupby("user_ID")["join_date"].shift(-1)
df_check_transfers["next_join_date"].fillna(df_check_transfers["renewal_date"], inplace=True)

df_check_transfers.drop(["chapter_ID", "industry", "sponsor_ID", "re_joins"], axis=1, inplace=True)
df_check_transfers = df_check_transfers[["user_ID", "join_date", "next_join_date", "renewal_date"]]
df_check_transfers.rename({"join_date": "join_range_A",
                           "next_join_date": "join_range_B",
                           "renewal_date": "correct_renewal_date"}, axis=1, inplace=True)
df_check_transfers.drop_duplicates(["user_ID", "join_range_A"], inplace=True, keep="last")
df_check_transfers.head(10)

Unnamed: 0,user_ID,join_range_A,join_range_B,correct_renewal_date
3086,7,2018-05-01,2019-01-01,2019-05-01
1192,7,2019-01-01,2020-12-01,2021-05-01
1221,7,2020-12-01,2021-11-01,2023-02-01
2650,7,2021-11-01,2023-02-01,2023-02-01
1351,20,2016-11-01,2017-11-01,2017-11-01
1588,31,2017-08-01,2020-03-01,2020-08-01
2149,31,2020-03-01,2022-05-01,2022-05-01
2904,32,2017-12-01,2018-11-01,2018-12-01
2715,32,2018-11-01,2021-12-01,2021-12-01
1932,37,2020-02-01,2021-02-01,2021-02-01


In [22]:
# Split data - members with any trasnfers and no transfers
transfer_IDs = df_check_transfers["user_ID"].unique().tolist()
df_master_with_transfers = df_master_clean.loc[df_master_clean["user_ID"].isin(transfer_IDs)].copy()
df_master_no_transfers = df_master_clean.loc[~df_master_clean["user_ID"].isin(transfer_IDs)].copy()

# Check if data was split correctly
transfers_shape = df_master_with_transfers.shape[0]
df_master_no_transfers.shape[0] + df_master_with_transfers.shape[0] == df_master_clean.shape[0]

True

In [23]:
df_master_with_transfers = df_master_with_transfers.merge(df_check_transfers, how="left", on=["user_ID"])
df_master_with_transfers.sample(10, random_state=13)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,industry,sponsor_ID,join_date,renewal_date,re_joins,reason,drop_date,join_range_A,join_range_B,correct_renewal_date
1509,1151,19,4,0,0,0,0,0,7,3,...,"Advertising & Marketing, Web Design",2958.0,2015-05-01,2022-04-01,3,,NaT,2006-10-01,2012-10-01,2009-10-01
1849,1453,19,3,0,0,0,0,1,4,1,...,"Coach, Life Coach",,2015-10-01,2018-04-01,5,Affordability of Costs,2017-07-18,2014-02-13,2014-04-28,2014-02-27
3984,1198,18,5,0,0,0,0,0,3,0,...,"Automotive, Automotive Sales & Leasing",1168.0,2016-02-01,2017-02-01,2,Not Enough Referrals,2018-01-13,2016-04-01,2018-03-01,2018-03-01
10142,1720,18,3,0,0,0,0,1,2,0,...,"Finance & Insurance, Financial Advisor",1482.0,2018-08-01,2021-03-01,2,Other Reason (see notes),2021-03-01,2018-08-01,2021-03-01,2021-03-01
18588,1911,14,4,0,0,0,0,2,0,0,...,"Construction, Electrician",,2021-07-01,2022-07-01,2,,NaT,2021-07-01,2022-07-01,2022-07-01
12264,2523,21,3,1,0,0,1,1,3,0,...,"Interior, Interior Designer",2943.0,2018-01-01,2020-01-01,2,Member Transferred BNI Chapters,2020-01-01,2018-01-01,2020-02-01,2020-01-01
9979,3055,32,4,0,1,0,0,3,0,5,...,"Advertising & Marketing, Printer",2212.0,2013-12-01,2021-06-01,2,Did Not Renew - No reason Given,2021-06-01,2021-08-01,2022-08-01,2022-08-01
876,147,11,4,1,0,0,0,0,3,1,...,"Legal, Lawyer: Wills & Estates",1811.0,2015-05-01,2018-06-01,4,,2018-06-01,2018-09-01,2022-05-01,2022-05-01
6984,2611,15,3,0,0,0,0,0,1,0,...,"Mortgage, Residential Mortgage",,2017-05-01,2018-05-01,4,No Reason Entered,2019-07-15,2017-08-01,2018-11-01,2018-08-01
8531,1380,10,3,0,0,0,1,1,2,1,...,"Signs, Sign Company",,2013-03-01,2019-10-01,2,Other Reason (see notes),2019-08-05,2009-05-01,2013-03-01,2010-12-01


In [24]:
cond1 = df_master_with_transfers["palms_date"] >= df_master_with_transfers["join_range_A"]
cond2 = df_master_with_transfers["palms_date"] < df_master_with_transfers["join_range_B"]
df_master_with_transfers = df_master_with_transfers.loc[cond1 & cond2]

print("Corrected shape:", df_master_with_transfers.shape[0])
print("Initial shape:", transfers_shape)

Corrected shape: 8213
Initial shape: 8374


In [25]:
df_master_with_transfers["check_dates"] = df_master_with_transfers["renewal_date"] == \
    df_master_with_transfers["correct_renewal_date"]
df_master_with_transfers["check_dates"].value_counts()

True     6379
False    1834
Name: check_dates, dtype: int64

Looks like there is a bit of data loss but this method of getting the correct renewal date is definitelly more accurate than the previous merge. There are **over 1800 records** where the ```renewal_date``` is incorrect and should be substituted with ```correct_renewal_date```.

In [26]:
df_master_with_transfers.loc[df_master_with_transfers["check_dates"] == False, "renewal_date"] == None
df_master_with_transfers["renewal_date"].fillna(df_check_transfers["correct_renewal_date"], inplace=True)

In [27]:
df_master_with_transfers["check_join_dates"] = df_master_with_transfers["join_date"] == \
    df_master_with_transfers["join_range_A"]
df_master_with_transfers["check_join_dates"].value_counts()

True     6459
False    1754
Name: check_join_dates, dtype: int64

Similar case for ```join_date```.

In [28]:
df_master_with_transfers.loc[df_master_with_transfers["check_join_dates"] == False, "join_date"] == None
df_master_with_transfers["join_date"].fillna(df_check_transfers["join_range_A"], inplace=True)

Re-create ```df_master_clean``` by concatenating ```df_master_with_transfers``` and ```df_master_no_transfers```.

In [29]:
df_master_with_transfers = df_master_with_transfers[df_master_no_transfers.columns]
df_master_clean = pd.concat([df_master_no_transfers, df_master_with_transfers], axis=0)
df_master_clean.shape

(38551, 23)

In [30]:
df_master_clean.head()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,re_joins,reason,drop_date
63,447,9,4,0,0,0,0,1,1,0,...,0,0,2015-01-01,"Finance & Insurance, Financial Investments",,1998-05-01,2022-07-01,1,,NaT
65,1366,9,4,0,0,0,0,4,0,0,...,4975,0,2015-01-01,"Advertising & Marketing, Sign Company",,2002-09-01,2022-06-01,1,,NaT
67,1197,9,4,0,0,0,0,2,3,0,...,21126,0,2015-01-01,"Coach, Business Coach",,2006-11-01,2018-02-01,1,No Reason Entered,2018-01-11
40,1035,9,4,0,0,0,0,0,7,0,...,500,0,2015-01-01,"Insurance, Property & Casualty Insurance",1366.0,2007-07-01,2015-05-01,1,No Reason Entered,2015-06-04
139,3035,12,1,3,0,0,0,0,1,0,...,125,0,2015-01-01,"Insurance, Group Benefits Consultant",,2008-04-01,2015-07-01,1,,2015-02-07


## Get relative renewal date for data aggregation

In [31]:
df_master_clean = df_master_clean.merge(df_temp, how="left", on=["user_ID", "chapter_ID"])
df_master_clean.loc[df_master_clean["additional_months_cumsum"].isna(), "additional_months_cumsum"] = 0
df_master_clean["additional_months_cumsum"] = df_master_clean["additional_months_cumsum"].astype(int)
df_master_clean.head()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,re_joins,reason,drop_date,additional_months_cumsum
0,447,9,4,0,0,0,0,1,1,0,...,0,2015-01-01,"Finance & Insurance, Financial Investments",,1998-05-01,2022-07-01,1,,NaT,0
1,1366,9,4,0,0,0,0,4,0,0,...,0,2015-01-01,"Advertising & Marketing, Sign Company",,2002-09-01,2022-06-01,1,,NaT,0
2,1197,9,4,0,0,0,0,2,3,0,...,0,2015-01-01,"Coach, Business Coach",,2006-11-01,2018-02-01,1,No Reason Entered,2018-01-11,0
3,1035,9,4,0,0,0,0,0,7,0,...,0,2015-01-01,"Insurance, Property & Casualty Insurance",1366.0,2007-07-01,2015-05-01,1,No Reason Entered,2015-06-04,0
4,3035,12,1,3,0,0,0,0,1,0,...,0,2015-01-01,"Insurance, Group Benefits Consultant",,2008-04-01,2015-07-01,1,,2015-02-07,0


In [32]:
df_master_clean["year_of_membership"] = (df_master_clean["palms_date"] - df_master_clean["join_date"]) \
    / np.timedelta64(1, 'M') + df_master_clean["additional_months_cumsum"]
df_master_clean["year_of_membership"] = df_master_clean["year_of_membership"] // 12
df_master_clean["months_to_renewal"] = (df_master_clean["renewal_date"] - df_master_clean["palms_date"]) \
    / np.timedelta64(1, 'M')
df_master_clean["months_to_renewal"] = df_master_clean["months_to_renewal"].round().astype(int)
df_master_clean["years_to_renewal"] = (df_master_clean["months_to_renewal"] - 1) // 12
df_master_clean["months_to_renewal"] = df_master_clean["months_to_renewal"] % 12

# Substitute "months_to_renewal" value 0 with 12 for ease of aggregation later on
df_master_clean.loc[df_master_clean["months_to_renewal"] == 0, "months_to_renewal"] = 12

for index, row in df_master_clean.iterrows():
    df_master_clean.at[index, 'relative_renewal_date'] = row['renewal_date'] - pd.DateOffset(years=row['years_to_renewal'])

# Found a few instances where the "relative_renewal_date" would be subtracted incorrectly and lacking one day to be correct
df_master_clean.loc[df_master_clean["relative_renewal_date"].dt.day != 1, "relative_renewal_date"] = \
    df_master_clean.loc[df_master_clean["relative_renewal_date"].dt.day != 1, "relative_renewal_date"] + pd.DateOffset(days=1)
    
df_master_clean.drop(["years_to_renewal", "additional_months_cumsum"], axis=1, inplace=True)
df_master_clean.sample(10, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,industry,sponsor_ID,join_date,renewal_date,re_joins,reason,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
7746,2622,19,4,0,0,0,0,0,5,0,...,"Moving & Storage, Custom Storage Solutions",1748.0,2016-11-01,2018-11-01,1,Not Enough Referrals,2018-11-13,0.0,10,2017-11-01
22497,1822,6,4,0,0,0,0,1,6,1,...,"Construction, Roofing & Gutters",1073.0,2018-04-01,2022-10-01,1,,NaT,2.0,6,2020-10-01
12546,114,17,1,1,0,0,2,1,1,0,...,"Insurance, Employee Benefits-Group",62.0,2014-12-01,2018-06-01,1,Takes Too Much Time,2018-05-01,3.0,3,2018-06-01
26965,2394,23,3,0,0,0,1,0,2,1,...,"Legal & Accounting, Business Law",,2019-03-01,2022-09-01,1,,NaT,2.0,6,2021-09-01
11684,452,21,1,0,1,0,0,0,1,0,...,"Legal, Lawyer Real Estate",2149.0,2017-10-01,2018-10-01,1,Did Not Renew - No reason Given,2018-10-01,0.0,10,2018-10-01
11442,998,6,3,0,0,0,0,2,2,0,...,"Construction, Flooring",100.0,2015-04-01,2021-09-01,1,Did Not Renew - No reason Given,2021-09-01,2.0,9,2018-09-01
33300,812,26,3,0,1,0,1,1,4,0,...,"Medical, Chiropractor",,2016-04-01,2018-10-01,2,Other Reason (see notes),2018-01-18,1.0,11,2018-10-01
23018,1512,19,3,0,0,1,0,5,2,2,...,"Telecommunications, Mobile Telecommunications",810.0,2019-06-01,2021-11-01,1,Changed Job,2021-03-04,0.0,6,2020-11-01
29703,2532,12,2,0,0,3,0,0,5,3,...,"Construction, Electrician",1730.0,2020-08-01,2022-08-01,1,,NaT,1.0,11,2022-08-01
29176,680,12,4,0,0,0,0,2,10,2,...,"Consulting, Professional Organizer",2415.0,2019-10-01,2022-10-01,1,,NaT,1.0,2,2021-10-01


## Aggregate 9-months data
Ensure that each groupby sum is aggregated for 9 months, not less.

In [33]:
df_pre_agg = df_master_clean.copy()

df_pre_agg["control_count"] = 1
df_pre_agg = df_pre_agg.loc[df_pre_agg["months_to_renewal"] >= 4]

df_pre_agg.drop_duplicates(subset=df_pre_agg.columns[:16], inplace=True)

df_pre_agg.drop(["industry",
                 "sponsor_ID",
                 "re_joins",
                 "join_date",
                 "renewal_date",
                 "palms_date",
                 "drop_date",
                 "months_to_renewal"], axis=1, inplace=True)

df_pre_agg.sample(10, random_state=13)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,reason,year_of_membership,relative_renewal_date,control_count
16513,1906,11,4,0,0,0,0,2,1,0,1,0,5,536,7,,0.0,2020-01-01,1
29072,2053,15,4,0,0,1,0,0,4,2,3,0,4,5187,6,,3.0,2022-07-01,1
8975,2590,2,3,0,0,0,1,0,0,2,2,0,2,0,0,,1.0,2017-12-01,1
34523,3055,32,3,1,0,0,0,2,1,4,3,0,1,0,1,Did Not Renew - No reason Given,4.0,2019-06-01,1
5668,2933,4,4,0,0,0,0,0,2,0,2,1,2,0,1,,0.0,2017-03-01,1
29258,2316,17,4,0,0,0,0,0,0,0,2,0,4,0,5,,0.0,2022-08-01,1
31971,485,19,3,1,0,0,0,0,4,0,1,0,2,627,6,Scheduling Conflicts,1.0,2017-05-01,1
3357,1708,19,2,1,0,0,0,1,2,0,2,0,11,518,1,Scheduling Conflicts,0.0,2016-11-01,1
13493,2014,24,5,0,0,0,0,0,4,0,5,0,3,2971,6,Did Not Renew - No reason Given,0.0,2018-09-01,1
33859,938,4,4,0,0,0,0,2,3,1,3,0,1,0,3,,0.0,2019-02-01,1


In [34]:
df_agg = df_pre_agg.copy()
df_agg = df_agg.groupby(["user_ID", "chapter_ID", "relative_renewal_date"]).sum()
df_agg.reset_index(drop=False, inplace=True)
df_agg = df_agg.loc[df_agg["control_count"] >= 9]
df_agg["year_of_membership"] = (df_agg["year_of_membership"]/9).round().astype(int)
df_agg.sample(10, random_state=13)

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,year_of_membership,control_count
375,273,31,2018-12-01,31,4,0,0,2,8,18,4,22,2,40,5688,25,1,9
2288,1665,32,2020-08-01,37,0,0,0,0,11,35,0,12,0,17,26113,32,1,9
4281,3046,32,2017-11-01,37,0,0,0,0,21,38,3,14,14,45,81061,0,6,9
2295,1667,24,2020-12-01,33,2,1,0,0,9,31,12,35,5,41,23163,56,2,9
627,484,10,2021-11-01,34,3,1,0,0,14,0,8,9,1,31,1483,61,1,9
1939,1427,18,2016-12-01,33,1,0,0,4,18,52,14,30,4,65,5443,10,0,9
1766,1314,6,2020-10-01,38,0,0,0,0,5,37,2,13,3,42,33068,34,3,9
315,235,26,2017-07-01,37,0,0,0,1,4,5,0,8,2,32,1242,39,0,9
1270,961,10,2020-12-01,32,2,1,0,0,6,17,6,21,1,56,19446,18,0,9
1526,1148,26,2021-07-01,36,0,0,0,1,9,15,19,33,0,28,6081,47,9,9


In [35]:
df_agg.shape

(2423, 18)

In [36]:
df_agg2 = df_pre_agg.copy()
df_agg2 = df_agg2.groupby(["user_ID", "relative_renewal_date"]).sum()
df_agg2.reset_index(drop=False, inplace=True)
df_agg2 = df_agg2.loc[df_agg2["control_count"] >= 9]
df_agg2["year_of_membership"] = (df_agg2["year_of_membership"]/9).round().astype(int)
df_agg2["chapter_ID"] = -1
df_agg2.sample(10, random_state=13)

Unnamed: 0,user_ID,relative_renewal_date,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,year_of_membership,control_count
962,717,2021-04-01,-1,38,0,0,1,0,7,30,11,85,0,40,83316,33,3,9
22,10,2018-10-01,-1,30,4,1,0,2,3,17,3,45,0,26,3648,57,1,9
1943,1440,2018-02-01,-1,30,8,0,0,1,2,8,6,15,2,29,2738,26,0,9
1239,938,2019-02-01,-1,36,0,1,0,1,3,14,2,31,3,23,550,59,0,9
193,142,2021-09-01,-1,37,0,0,0,0,3,41,39,80,11,49,5579,64,1,9
1881,1394,2021-03-01,-1,38,0,0,0,0,19,11,2,6,1,64,9348,219,7,9
3291,2372,2019-06-01,-1,34,0,0,0,5,11,22,14,24,1,31,39922,19,4,9
1320,1000,2016-05-01,-1,29,2,0,0,3,6,4,11,12,2,15,10879,18,2,9
3561,2559,2017-12-01,-1,36,1,0,0,0,5,14,25,38,1,34,446,66,1,9
2902,2092,2017-08-01,-1,32,1,1,0,3,1,9,4,14,12,41,1285,22,5,9


In [37]:
df_agg = pd.concat([df_agg, df_agg2])
del df_agg2
df_agg.sample(10, random_state=13)

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,year_of_membership,control_count
3938,2835,-1,2020-04-01,36,2,0,0,0,18,53,29,35,7,88,19884,7,0,9
3845,2770,32,2022-02-01,39,0,0,0,0,17,29,8,12,1,31,12113,16,9,9
295,227,-1,2021-07-01,28,7,0,0,1,2,14,15,32,0,85,6360,75,0,9
3474,2482,28,2021-07-01,31,2,0,0,0,4,2,3,8,0,25,0,25,0,9
3352,2415,-1,2017-04-01,35,1,0,0,0,12,13,5,16,2,13,29902,10,2,9
3798,2746,-1,2017-07-01,35,0,0,0,2,5,24,6,7,3,65,14820,48,0,9
1170,887,-1,2016-03-01,37,0,0,0,2,7,8,4,14,1,33,11062,4,2,9
1165,883,-1,2021-01-01,34,0,1,1,2,9,11,1,2,3,34,27410,39,3,9
2132,1554,8,2021-07-01,30,1,1,2,1,3,24,4,9,6,37,48052,40,1,9
1201,897,17,2016-10-01,34,1,0,0,3,6,32,15,44,5,13,20238,34,3,9


In [38]:
df_agg.shape

(4852, 18)

In [39]:
df_agg.drop_duplicates(subset=["user_ID", "relative_renewal_date"], inplace=True)
df_agg.shape

(2429, 18)

### Get member transfer chapters' (the one they joined after transfer)

In [40]:
df_agg.loc[df_agg["chapter_ID"] == -1]

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,year_of_membership,control_count
1268,969,-1,2017-04-01,36,0,0,0,2,12,18,1,8,10,63,24515,69,0,10
1768,1324,-1,2020-02-01,22,6,0,0,2,11,17,2,30,0,42,134932,23,0,9
1931,1427,-1,2017-12-01,31,1,0,0,7,15,33,3,38,6,74,5257,17,1,11
4074,2915,-1,2020-06-01,35,1,0,0,2,5,44,17,14,7,102,41144,50,1,10
4230,3024,-1,2019-03-01,34,2,4,0,1,2,25,10,20,13,24,5900,182,2,10
4277,3047,-1,2021-05-01,30,2,0,3,0,9,41,4,9,8,32,28134,24,6,10


In [41]:
df_agg.loc[df_agg["chapter_ID"] == -1, "relative_renewal_date"] - pd.DateOffset(months=5)

1268   2016-11-01
1768   2019-09-01
1931   2017-07-01
4074   2020-01-01
4230   2018-10-01
4277   2020-12-01
Name: relative_renewal_date, dtype: datetime64[ns]

In [42]:
df_agg["chapter_transfer"] = 0
df_agg.loc[df_agg["chapter_ID"] == -1, "chapter_transfer"] = 1

In [43]:
for index, row in df_agg.loc[df_agg["chapter_ID"] == -1].iterrows():
    get_date = row["relative_renewal_date"] - pd.DateOffset(months=5)
    chapter_ID = df_master_clean.loc[(df_master_clean["user_ID"] == row["user_ID"]) & (df_master_clean["palms_date"] == get_date), "chapter_ID"]
    df_agg.loc[index, "chapter_ID"] = chapter_ID.values[0]

## Feature Engineering
### 1. Chapter size

In [44]:
df_master_clean.sample(10, random_state=13)
df_chapter_count = df_master_clean.groupby(["palms_date", "chapter_ID"])[["user_ID"]].count()
df_chapter_count.reset_index(level=["palms_date", "chapter_ID"], inplace=True)
df_chapter_count.rename({"user_ID": "chapter_size"}, axis=1, inplace=True)
df_chapter_count.sample(10, random_state=13)

Unnamed: 0,palms_date,chapter_ID,chapter_size
1016,2020-03-01,32,35
749,2019-01-01,14,24
296,2016-11-01,11,30
1299,2021-05-01,23,23
807,2019-04-01,19,34
189,2016-04-01,9,33
448,2017-08-01,31,23
1261,2021-03-01,31,19
620,2018-06-01,12,20
1097,2020-08-01,6,31


In [45]:
# Get "temp_date" for merging new features
df_agg["temp_date"] = df_agg["relative_renewal_date"] - pd.DateOffset(months=4)
df_agg.shape

(2429, 20)

In [46]:
df_agg.loc[df_agg["temp_date"].dt.day != 1]

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,year_of_membership,control_count,chapter_transfer,temp_date


In [47]:
df_agg = df_agg.merge(df_chapter_count,
                      how="left",
                      left_on=["chapter_ID", "temp_date"],
                      right_on=["chapter_ID", "palms_date"])

df_agg.drop(["palms_date"], axis=1, inplace=True)
df_agg.shape

(2429, 21)

In [48]:
df_agg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2429 entries, 0 to 2428
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   user_ID                2429 non-null   int64         
 1   chapter_ID             2429 non-null   int64         
 2   relative_renewal_date  2429 non-null   datetime64[ns]
 3   P                      2429 non-null   int64         
 4   A                      2429 non-null   int64         
 5   L                      2429 non-null   int64         
 6   M                      2429 non-null   int64         
 7   S                      2429 non-null   int64         
 8   RGI                    2429 non-null   int64         
 9   RGO                    2429 non-null   int64         
 10  RRI                    2429 non-null   int64         
 11  RRO                    2429 non-null   int64         
 12  V                      2429 non-null   int64         
 13  1-2

### 2. Chapter retention rate

In [49]:
df_dropped.head()

Unnamed: 0,user_ID,chapter_ID,reason,drop_date
0,637,9,Left Company,2021-11-03
1,1185,23,Changed Job,2021-10-22
2,1225,14,Takes Too Much Time,2021-10-13
3,2138,15,Did Not Renew - No reason Given,2021-11-01
4,1610,15,Other Reason (see notes),2021-10-12


In [50]:
df_agg.head()

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,...,RRO,V,1-2-1,TYFCB,CEU,year_of_membership,control_count,chapter_transfer,temp_date,chapter_size
0,0,31,2016-04-01,33,0,0,1,0,21,7,...,32,1,37,129784,45,0,9,0,2015-12-01,32
1,2,18,2017-05-01,33,2,2,0,0,6,17,...,22,1,26,9285,1,0,9,0,2017-01-01,34
2,2,18,2018-05-01,35,1,0,0,2,6,24,...,36,1,20,7263,10,1,9,0,2018-01-01,21
3,2,18,2019-05-01,36,0,0,0,1,9,8,...,19,3,28,1860,31,2,9,0,2019-01-01,22
4,2,18,2020-05-01,33,1,0,0,3,10,13,...,47,4,30,6668,16,3,9,0,2020-01-01,26


In [51]:
df_master_clean.loc[(df_master_clean["user_ID"] == 0) & (df_master_clean["relative_renewal_date"] == "2016-04-01")]

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,industry,sponsor_ID,join_date,renewal_date,re_joins,reason,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
1040,0,31,3,0,0,0,0,0,1,0,...,"Art, Art Framing and Products",410.0,2015-04-01,2017-04-01,1,Personal Reasons,2016-09-14,0.0,12,2016-04-01
1299,0,31,4,0,0,0,0,2,1,4,...,"Art, Art Framing and Products",410.0,2015-04-01,2017-04-01,1,Personal Reasons,2016-09-14,0.0,11,2016-04-01
1578,0,31,4,0,0,0,0,4,0,3,...,"Art, Art Framing and Products",410.0,2015-04-01,2017-04-01,1,Personal Reasons,2016-09-14,0.0,10,2016-04-01
1868,0,31,4,0,0,0,0,4,0,2,...,"Art, Art Framing and Products",410.0,2015-04-01,2017-04-01,1,Personal Reasons,2016-09-14,0.0,9,2016-04-01
2157,0,31,3,0,0,1,0,1,0,4,...,"Art, Art Framing and Products",410.0,2015-04-01,2017-04-01,1,Personal Reasons,2016-09-14,0.0,8,2016-04-01
2444,0,31,5,0,0,0,0,3,1,6,...,"Art, Art Framing and Products",410.0,2015-04-01,2017-04-01,1,Personal Reasons,2016-09-14,0.0,7,2016-04-01
2725,0,31,4,0,0,0,0,1,2,3,...,"Art, Art Framing and Products",410.0,2015-04-01,2017-04-01,1,Personal Reasons,2016-09-14,0.0,6,2016-04-01
3002,0,31,3,0,0,0,0,2,2,4,...,"Art, Art Framing and Products",410.0,2015-04-01,2017-04-01,1,Personal Reasons,2016-09-14,0.0,5,2016-04-01
3282,0,31,3,0,0,0,0,4,0,7,...,"Art, Art Framing and Products",410.0,2015-04-01,2017-04-01,1,Personal Reasons,2016-09-14,0.0,4,2016-04-01
3582,0,31,4,0,0,0,0,2,0,3,...,"Art, Art Framing and Products",410.0,2015-04-01,2017-04-01,1,Personal Reasons,2016-09-14,0.0,3,2016-04-01


In [52]:
df_agg["drop_range_A"] = df_agg["relative_renewal_date"] - pd.DateOffset(years=1)
df_agg["drop_range_B"] = df_agg["relative_renewal_date"] - pd.DateOffset(months=3)
df_agg.head()

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,...,1-2-1,TYFCB,CEU,year_of_membership,control_count,chapter_transfer,temp_date,chapter_size,drop_range_A,drop_range_B
0,0,31,2016-04-01,33,0,0,1,0,21,7,...,37,129784,45,0,9,0,2015-12-01,32,2015-04-01,2016-01-01
1,2,18,2017-05-01,33,2,2,0,0,6,17,...,26,9285,1,0,9,0,2017-01-01,34,2016-05-01,2017-02-01
2,2,18,2018-05-01,35,1,0,0,2,6,24,...,20,7263,10,1,9,0,2018-01-01,21,2017-05-01,2018-02-01
3,2,18,2019-05-01,36,0,0,0,1,9,8,...,28,1860,31,2,9,0,2019-01-01,22,2018-05-01,2019-02-01
4,2,18,2020-05-01,33,1,0,0,3,10,13,...,30,6668,16,3,9,0,2020-01-01,26,2019-05-01,2020-02-01


In [53]:
df_agg = df_agg.merge(df_chapter_count,
                      how="left",
                      left_on=["chapter_ID", "drop_range_A"],
                      right_on=["chapter_ID", "palms_date"])

df_agg.drop(["palms_date"], axis=1, inplace=True)
df_agg.head()

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,...,TYFCB,CEU,year_of_membership,control_count,chapter_transfer,temp_date,chapter_size_x,drop_range_A,drop_range_B,chapter_size_y
0,0,31,2016-04-01,33,0,0,1,0,21,7,...,129784,45,0,9,0,2015-12-01,32,2015-04-01,2016-01-01,19
1,2,18,2017-05-01,33,2,2,0,0,6,17,...,9285,1,0,9,0,2017-01-01,34,2016-05-01,2017-02-01,32
2,2,18,2018-05-01,35,1,0,0,2,6,24,...,7263,10,1,9,0,2018-01-01,21,2017-05-01,2018-02-01,33
3,2,18,2019-05-01,36,0,0,0,1,9,8,...,1860,31,2,9,0,2019-01-01,22,2018-05-01,2019-02-01,23
4,2,18,2020-05-01,33,1,0,0,3,10,13,...,6668,16,3,9,0,2020-01-01,26,2019-05-01,2020-02-01,25


In [54]:
df_agg["chapter_members_dropped"] = -1

for index, row in df_agg.iterrows():
    cond1 = (df_dropped["drop_date"] >= row["drop_range_A"]) & (df_dropped["drop_date"] <= row["drop_range_B"])
    cond2 = df_dropped["chapter_ID"] == row["chapter_ID"]
    df_agg.loc[index, "chapter_members_dropped"] = df_dropped.loc[cond1 & cond2, "user_ID"].count()

In [55]:
df_agg["chapter_retention_rate"] = (df_agg["chapter_size_y"] - df_agg["chapter_members_dropped"]) / df_agg["chapter_size_y"]
df_agg.head()

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,...,year_of_membership,control_count,chapter_transfer,temp_date,chapter_size_x,drop_range_A,drop_range_B,chapter_size_y,chapter_members_dropped,chapter_retention_rate
0,0,31,2016-04-01,33,0,0,1,0,21,7,...,0,9,0,2015-12-01,32,2015-04-01,2016-01-01,19,13,0.315789
1,2,18,2017-05-01,33,2,2,0,0,6,17,...,0,9,0,2017-01-01,34,2016-05-01,2017-02-01,32,9,0.71875
2,2,18,2018-05-01,35,1,0,0,2,6,24,...,1,9,0,2018-01-01,21,2017-05-01,2018-02-01,33,19,0.424242
3,2,18,2019-05-01,36,0,0,0,1,9,8,...,2,9,0,2019-01-01,22,2018-05-01,2019-02-01,23,6,0.73913
4,2,18,2020-05-01,33,1,0,0,3,10,13,...,3,9,0,2020-01-01,26,2019-05-01,2020-02-01,25,6,0.76


### 3. Chapter growth rate

In [56]:
df_agg["chapter_growth_rate"] = -1

for index, row in df_agg.iterrows():
    cond1 = (df_database["join_date"] >= row["drop_range_A"]) & (df_database["join_date"] <= row["drop_range_B"])
    cond2 = df_database["chapter_ID"] == row["chapter_ID"]
    df_agg.loc[index, "chapter_growth_rate"] = df_database.loc[cond1 & cond2, "user_ID"].count()

In [57]:
df_agg["chapter_growth_rate"] = (df_agg["chapter_size_y"] - df_agg["chapter_members_dropped"] + df_agg["chapter_growth_rate"]) / df_agg["chapter_size_y"]
df_agg.head()

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,...,control_count,chapter_transfer,temp_date,chapter_size_x,drop_range_A,drop_range_B,chapter_size_y,chapter_members_dropped,chapter_retention_rate,chapter_growth_rate
0,0,31,2016-04-01,33,0,0,1,0,21,7,...,9,0,2015-12-01,32,2015-04-01,2016-01-01,19,13,0.315789,1.421053
1,2,18,2017-05-01,33,2,2,0,0,6,17,...,9,0,2017-01-01,34,2016-05-01,2017-02-01,32,9,0.71875,1.09375
2,2,18,2018-05-01,35,1,0,0,2,6,24,...,9,0,2018-01-01,21,2017-05-01,2018-02-01,33,19,0.424242,0.636364
3,2,18,2019-05-01,36,0,0,0,1,9,8,...,9,0,2019-01-01,22,2018-05-01,2019-02-01,23,6,0.73913,1.217391
4,2,18,2020-05-01,33,1,0,0,3,10,13,...,9,0,2020-01-01,26,2019-05-01,2020-02-01,25,6,0.76,1.12


In [58]:
df_agg.drop(["drop_range_A", "drop_range_B", "chapter_size_y", "chapter_members_dropped"], axis=1, inplace=True)
df_agg.rename({"chapter_size_x": "chapter_size"}, axis=1, inplace=True)

### 4. Seat popularity rate

In [59]:
df_master_clean.head()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,industry,sponsor_ID,join_date,renewal_date,re_joins,reason,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
0,447,9,4,0,0,0,0,1,1,0,...,"Finance & Insurance, Financial Investments",,1998-05-01,2022-07-01,1,,NaT,16.0,6,2015-07-01
1,1366,9,4,0,0,0,0,4,0,0,...,"Advertising & Marketing, Sign Company",,2002-09-01,2022-06-01,1,,NaT,12.0,5,2015-06-01
2,1197,9,4,0,0,0,0,2,3,0,...,"Coach, Business Coach",,2006-11-01,2018-02-01,1,No Reason Entered,2018-01-11,8.0,1,2015-02-01
3,1035,9,4,0,0,0,0,0,7,0,...,"Insurance, Property & Casualty Insurance",1366.0,2007-07-01,2015-05-01,1,No Reason Entered,2015-06-04,7.0,4,2015-05-01
4,3035,12,1,3,0,0,0,0,1,0,...,"Insurance, Group Benefits Consultant",,2008-04-01,2015-07-01,1,,2015-02-07,6.0,6,2015-07-01


In [60]:
df_agg = df_agg.merge(df_master_clean[["user_ID", "chapter_ID", "industry"]].drop_duplicates(),
                      how="left",
                      on=["user_ID", "chapter_ID"])
df_agg.shape

(2429, 24)

In [61]:
df_chapter_count = df_master_clean.groupby("palms_date")[["chapter_ID"]].nunique()
df_chapter_count.rename({"chapter_ID": "chapter_count"}, axis=1, inplace=True)
df_chapter_count.head()

Unnamed: 0_level_0,chapter_count
palms_date,Unnamed: 1_level_1
2015-01-01,12
2015-02-01,12
2015-03-01,12
2015-04-01,12
2015-05-01,12


In [62]:
df_agg = df_agg.merge(df_chapter_count, how="left", left_on="temp_date", right_index=True)
df_agg.shape

(2429, 25)

In [63]:
df_master_clean.head()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,industry,sponsor_ID,join_date,renewal_date,re_joins,reason,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
0,447,9,4,0,0,0,0,1,1,0,...,"Finance & Insurance, Financial Investments",,1998-05-01,2022-07-01,1,,NaT,16.0,6,2015-07-01
1,1366,9,4,0,0,0,0,4,0,0,...,"Advertising & Marketing, Sign Company",,2002-09-01,2022-06-01,1,,NaT,12.0,5,2015-06-01
2,1197,9,4,0,0,0,0,2,3,0,...,"Coach, Business Coach",,2006-11-01,2018-02-01,1,No Reason Entered,2018-01-11,8.0,1,2015-02-01
3,1035,9,4,0,0,0,0,0,7,0,...,"Insurance, Property & Casualty Insurance",1366.0,2007-07-01,2015-05-01,1,No Reason Entered,2015-06-04,7.0,4,2015-05-01
4,3035,12,1,3,0,0,0,0,1,0,...,"Insurance, Group Benefits Consultant",,2008-04-01,2015-07-01,1,,2015-02-07,6.0,6,2015-07-01


In [64]:
df_agg["seat_popularity_rate"] = -1

for index, row in df_agg.iterrows():
    cond1 = df_master_clean["industry"] == row["industry"]
    cond2 = df_master_clean["palms_date"] == row["temp_date"]
    df_agg.loc[index, "seat_popularity_rate"] = df_master_clean.loc[cond1 & cond2, "user_ID"].count()

df_agg["seat_popularity_rate"] = df_agg["seat_popularity_rate"] / df_agg["chapter_count"]
df_agg.sample(10, random_state=23)

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,...,year_of_membership,control_count,chapter_transfer,temp_date,chapter_size,chapter_retention_rate,chapter_growth_rate,industry,chapter_count,seat_popularity_rate
287,427,29,2021-06-01,33,3,0,0,0,16,17,...,0,9,0,2021-02-01,25,0.75,1.041667,"Health & Wellness, Massage Therapist",22,0.363636
415,578,15,2019-12-01,33,3,0,0,1,13,47,...,3,9,0,2019-08-01,35,0.586207,1.482759,"Trades, Builder-Residential",18,0.055556
1403,1809,32,2016-07-01,35,2,0,0,2,16,27,...,3,9,0,2016-03-01,39,0.619048,0.928571,"Accounting, Chartered Professional Accountant ...",15,0.066667
1924,2467,10,2020-12-01,27,4,1,1,3,7,6,...,3,9,0,2020-08-01,21,0.777778,1.333333,"Trades, Windows",20,0.05
1049,1412,9,2019-08-01,36,0,0,0,1,11,14,...,1,9,0,2019-04-01,37,0.815789,1.105263,"Marketing, Digital Marketing",18,0.333333
53,90,17,2016-09-01,38,0,0,0,0,34,15,...,2,9,0,2016-05-01,34,0.634146,0.853659,"z(Archived Duplicate) Alternative Medicine, Nu...",15,0.066667
2,2,18,2018-05-01,35,1,0,0,2,6,24,...,1,9,0,2018-01-01,21,0.424242,0.636364,"Health & Wellness, Chiropractor",19,0.210526
2069,2676,26,2017-11-01,36,0,0,0,2,9,24,...,0,9,0,2017-07-01,33,0.685714,1.171429,"Accounting, Bookkeeper",17,0.529412
1643,2128,12,2022-01-01,32,1,3,0,1,23,44,...,2,9,0,2021-09-01,36,0.9375,1.1875,"Retail, Florist",22,0.136364
1016,1381,19,2017-04-01,32,1,1,0,3,4,8,...,0,9,0,2016-12-01,30,0.62963,1.259259,"Trades, Electrician",15,0.333333


In [65]:
df_agg.shape

(2429, 26)

In [66]:
df_agg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2429 entries, 0 to 2428
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   user_ID                 2429 non-null   int64         
 1   chapter_ID              2429 non-null   int64         
 2   relative_renewal_date   2429 non-null   datetime64[ns]
 3   P                       2429 non-null   int64         
 4   A                       2429 non-null   int64         
 5   L                       2429 non-null   int64         
 6   M                       2429 non-null   int64         
 7   S                       2429 non-null   int64         
 8   RGI                     2429 non-null   int64         
 9   RGO                     2429 non-null   int64         
 10  RRI                     2429 non-null   int64         
 11  RRO                     2429 non-null   int64         
 12  V                       2429 non-null   int64   

In [67]:
df_agg.loc[df_agg["industry"].isna()].shape

(2, 26)

In [68]:
# Fixing "industry" for the users which are lacking it
for index, row in df_agg.loc[df_agg["industry"].isna()].iterrows():
    user_industry = df_database.loc[df_database["user_ID"] == row["user_ID"], "industry"].values[-1]
    df_agg.loc[index, "industry"] = user_industry
    cond1 = df_master_clean["industry"] == user_industry
    cond2 = df_master_clean["palms_date"] == row["temp_date"]
    df_agg.loc[index, "seat_popularity_rate"] = df_master_clean.loc[cond1 & cond2, "user_ID"].count()
    df_agg.loc[index, "seat_popularity_rate"] = df_agg.loc[index, "seat_popularity_rate"] / df_agg.loc[index, "chapter_count"]

In [69]:
df_agg.sample(10, random_state=23)

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,...,year_of_membership,control_count,chapter_transfer,temp_date,chapter_size,chapter_retention_rate,chapter_growth_rate,industry,chapter_count,seat_popularity_rate
287,427,29,2021-06-01,33,3,0,0,0,16,17,...,0,9,0,2021-02-01,25,0.75,1.041667,"Health & Wellness, Massage Therapist",22,0.363636
415,578,15,2019-12-01,33,3,0,0,1,13,47,...,3,9,0,2019-08-01,35,0.586207,1.482759,"Trades, Builder-Residential",18,0.055556
1403,1809,32,2016-07-01,35,2,0,0,2,16,27,...,3,9,0,2016-03-01,39,0.619048,0.928571,"Accounting, Chartered Professional Accountant ...",15,0.066667
1924,2467,10,2020-12-01,27,4,1,1,3,7,6,...,3,9,0,2020-08-01,21,0.777778,1.333333,"Trades, Windows",20,0.05
1049,1412,9,2019-08-01,36,0,0,0,1,11,14,...,1,9,0,2019-04-01,37,0.815789,1.105263,"Marketing, Digital Marketing",18,0.333333
53,90,17,2016-09-01,38,0,0,0,0,34,15,...,2,9,0,2016-05-01,34,0.634146,0.853659,"z(Archived Duplicate) Alternative Medicine, Nu...",15,0.066667
2,2,18,2018-05-01,35,1,0,0,2,6,24,...,1,9,0,2018-01-01,21,0.424242,0.636364,"Health & Wellness, Chiropractor",19,0.210526
2069,2676,26,2017-11-01,36,0,0,0,2,9,24,...,0,9,0,2017-07-01,33,0.685714,1.171429,"Accounting, Bookkeeper",17,0.529412
1643,2128,12,2022-01-01,32,1,3,0,1,23,44,...,2,9,0,2021-09-01,36,0.9375,1.1875,"Retail, Florist",22,0.136364
1016,1381,19,2017-04-01,32,1,1,0,3,4,8,...,0,9,0,2016-12-01,30,0.62963,1.259259,"Trades, Electrician",15,0.333333


In [70]:
df_agg.describe().iloc[:,-5:]

Unnamed: 0,chapter_size,chapter_retention_rate,chapter_growth_rate,chapter_count,seat_popularity_rate
count,2429.0,2429.0,2429.0,2429.0,2429.0
mean,29.14121,0.658677,1.183383,18.228077,0.268144
std,6.638853,0.144879,0.346452,2.558995,0.216183
min,10.0,0.043478,0.575758,12.0,0.0
25%,24.0,0.576923,1.0,17.0,0.090909
50%,30.0,0.675676,1.107143,18.0,0.2
75%,34.0,0.75,1.269231,20.0,0.4
max,44.0,1.0,2.875,22.0,1.0


In [71]:
df_agg.drop(["chapter_count"], axis=1, inplace=True)

## Label records
1. Join ```df_agg``` with ```df_dropped```
2. Double-check if the ```drop_date``` is alligned with the last record in ```df_master_cleaned```

then label accordingly.

In [72]:
df_dropped.head()

Unnamed: 0,user_ID,chapter_ID,reason,drop_date
0,637,9,Left Company,2021-11-03
1,1185,23,Changed Job,2021-10-22
2,1225,14,Takes Too Much Time,2021-10-13
3,2138,15,Did Not Renew - No reason Given,2021-11-01
4,1610,15,Other Reason (see notes),2021-10-12


In [73]:
df_final = df_agg.merge(df_dropped, how="left", on=["user_ID", "chapter_ID"])
df_final.sort_values(["user_ID", "relative_renewal_date"], inplace=True)
df_final.shape

(2569, 27)

In [74]:
df_final.head()

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,...,control_count,chapter_transfer,temp_date,chapter_size,chapter_retention_rate,chapter_growth_rate,industry,seat_popularity_rate,reason,drop_date
0,0,31,2016-04-01,33,0,0,1,0,21,7,...,9,0,2015-12-01,32,0.315789,1.421053,"Art, Art Framing and Products",0.076923,Personal Reasons,2016-09-14
1,2,18,2017-05-01,33,2,2,0,0,6,17,...,9,0,2017-01-01,34,0.71875,1.09375,"Health & Wellness, Chiropractor",0.266667,,NaT
2,2,18,2018-05-01,35,1,0,0,2,6,24,...,9,0,2018-01-01,21,0.424242,0.636364,"Health & Wellness, Chiropractor",0.210526,,NaT
3,2,18,2019-05-01,36,0,0,0,1,9,8,...,9,0,2019-01-01,22,0.73913,1.217391,"Health & Wellness, Chiropractor",0.263158,,NaT
4,2,18,2020-05-01,33,1,0,0,3,10,13,...,9,0,2020-01-01,26,0.76,1.12,"Health & Wellness, Chiropractor",0.277778,,NaT


In [75]:
df_final.drop("reason", axis=1, inplace=True)
df_final.drop_duplicates(df_final.columns[:16], keep="last", inplace=True)
df_final.shape

(2429, 26)

In [76]:
final_month = df_master_clean.groupby(["user_ID", "chapter_ID"])[["palms_date"]].max()
final_month.reset_index(["user_ID", "chapter_ID"], inplace=True)
final_month.rename({"palms_date": "final_palms_date"}, axis=1, inplace=True)
final_month.head()

Unnamed: 0,user_ID,chapter_ID,final_palms_date
0,0,31,2016-09-01
1,2,18,2021-10-01
2,3,11,2017-09-01
3,4,18,2021-10-01
4,6,4,2017-11-01


In [77]:
df_final = df_final.merge(final_month, how="left", on=["user_ID", "chapter_ID"])
df_final.drop_duplicates(df_final.columns[:16], keep="last", inplace=True)
df_final.sample(10, random_state=13)

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,...,control_count,chapter_transfer,temp_date,chapter_size,chapter_retention_rate,chapter_growth_rate,industry,seat_popularity_rate,drop_date,final_palms_date
521,717,6,2021-04-01,38,0,0,1,0,7,30,...,9,0,2020-12-01,31,0.878788,0.939394,"Legal & Accounting, Legal & Accounting (Other)",0.2,NaT,2021-10-01
14,10,19,2018-10-01,30,4,1,0,2,3,17,...,9,0,2018-06-01,32,0.46875,1.09375,"Trades, Painter",0.611111,2018-09-18,2018-09-01
1071,1440,4,2018-02-01,30,8,0,0,1,2,8,...,9,0,2017-10-01,20,1.0,1.5,"Health and Wellness, Fitness Trainer",0.5,NaT,2017-11-01
671,938,4,2019-02-01,36,0,1,0,1,3,14,...,9,0,2018-10-01,16,1.0,1.47619,"Accounting, Accountant",0.333333,NaT,2019-01-01
99,142,24,2021-09-01,37,0,0,0,0,3,41,...,9,0,2021-05-01,19,0.772727,0.909091,"Health & Wellness, Pharmacist",0.045455,NaT,2021-10-01
1033,1394,11,2021-03-01,38,0,0,0,0,19,11,...,9,0,2020-11-01,23,0.809524,1.142857,"Training & Coaching, Business Training/Coach",0.45,NaT,2021-10-01
1835,2372,32,2019-06-01,34,0,0,0,5,11,22,...,9,0,2019-02-01,34,0.636364,1.090909,"Construction, HVAC - Heating & Air",0.055556,NaT,2021-10-01
717,1000,11,2016-05-01,29,2,0,0,3,6,4,...,9,0,2016-01-01,19,0.4,0.88,"Car & Motorcycle, Auto/Car Repair",0.076923,NaT,2021-10-01
1981,2559,25,2017-12-01,36,1,0,0,0,5,14,...,9,0,2017-08-01,26,0.727273,1.136364,"Signs, Sign Company",0.166667,2018-06-14,2018-06-01
1621,2092,10,2017-08-01,32,1,1,0,3,1,9,...,9,0,2017-04-01,20,0.583333,0.833333,"Marketing, Market Branding",0.066667,2017-08-01,2017-08-01


In [78]:
# Remove data which cannot be labelled
df_final = df_final.loc[df_final["relative_renewal_date"] <= "2021-09-01"]
df_final.shape

(2209, 27)

In [79]:
df_final[["user_ID", "chapter_ID", "relative_renewal_date", "drop_date", "final_palms_date"]].sample(10, random_state=13)

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,drop_date,final_palms_date
22,31,10,2019-08-01,2020-02-14,2020-02-01
1693,2180,6,2019-12-01,NaT,2021-10-01
1589,2017,10,2017-06-01,2017-03-09,2017-03-01
1907,2445,6,2019-12-01,2019-12-01,2019-12-01
1239,1631,23,2020-05-01,2016-02-02,2020-01-01
1568,1993,24,2021-02-01,NaT,2021-10-01
2301,2935,32,2019-10-01,NaT,2021-10-01
1064,1422,14,2019-02-01,2018-12-03,2018-12-01
239,356,23,2018-05-01,2021-01-15,2021-03-01
1063,1421,9,2018-03-01,2018-03-01,2018-03-01


In [80]:
df_final["date_diff"] = df_final["final_palms_date"] - df_final["drop_date"]
df_final.head()

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,...,chapter_transfer,temp_date,chapter_size,chapter_retention_rate,chapter_growth_rate,industry,seat_popularity_rate,drop_date,final_palms_date,date_diff
0,0,31,2016-04-01,33,0,0,1,0,21,7,...,0,2015-12-01,32,0.315789,1.421053,"Art, Art Framing and Products",0.076923,2016-09-14,2016-09-01,-13 days
1,2,18,2017-05-01,33,2,2,0,0,6,17,...,0,2017-01-01,34,0.71875,1.09375,"Health & Wellness, Chiropractor",0.266667,NaT,2021-10-01,NaT
2,2,18,2018-05-01,35,1,0,0,2,6,24,...,0,2018-01-01,21,0.424242,0.636364,"Health & Wellness, Chiropractor",0.210526,NaT,2021-10-01,NaT
3,2,18,2019-05-01,36,0,0,0,1,9,8,...,0,2019-01-01,22,0.73913,1.217391,"Health & Wellness, Chiropractor",0.263158,NaT,2021-10-01,NaT
4,2,18,2020-05-01,33,1,0,0,3,10,13,...,0,2020-01-01,26,0.76,1.12,"Health & Wellness, Chiropractor",0.277778,NaT,2021-10-01,NaT


In [81]:
df_final.loc[df_final["date_diff"] > pd.Timedelta(days=31)].shape

(155, 28)

In [82]:
df_final.loc[df_final["drop_date"].isna()].shape

(841, 28)

In [83]:
df_final.loc[df_final["final_palms_date"].isna()].shape

(2, 28)

In [84]:
df_final.loc[df_final["final_palms_date"].isna()]

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,...,chapter_transfer,temp_date,chapter_size,chapter_retention_rate,chapter_growth_rate,industry,seat_popularity_rate,drop_date,final_palms_date,date_diff
2275,2909,19,2018-06-01,35,1,0,0,1,6,15,...,0,2018-02-01,30,0.612903,1.064516,"Trades, Electrical Contractor",0.263158,NaT,NaT,NaT
2360,3015,9,2017-05-01,34,1,0,0,3,16,25,...,0,2017-01-01,35,0.741935,1.064516,"Food & Beverage, Restaurant",0.066667,NaT,NaT,NaT


In [85]:
for index, row in df_final.loc[df_final["final_palms_date"].isna()].iterrows():
    df_final.loc[index, "final_palms_date"] = df_master_clean.loc[df_master_clean["user_ID"] == row["user_ID"], "palms_date"].max()

In [86]:
df_final.loc[df_final["final_palms_date"].isna()].shape

(0, 28)

In [87]:
df_final.loc[df_final["date_diff"] > pd.Timedelta(days=31)].sample(5, random_state=13)

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,P,A,L,M,S,RGI,RGO,...,chapter_transfer,temp_date,chapter_size,chapter_retention_rate,chapter_growth_rate,industry,seat_popularity_rate,drop_date,final_palms_date,date_diff
2311,2948,23,2019-11-01,34,0,0,0,3,9,22,...,0,2019-07-01,34,0.884615,1.5,"Photography, Photographer",0.222222,2015-11-01,2021-10-01,2161 days
2153,2797,26,2020-10-01,27,1,0,6,2,9,9,...,0,2020-06-01,32,0.677419,1.193548,"Coach, Business Coach",0.15,2012-06-25,2020-12-01,3081 days
1451,1870,10,2018-12-01,34,1,0,0,2,9,28,...,0,2018-08-01,24,0.777778,1.333333,"Consulting, Energy Consultant",0.055556,2018-12-06,2021-09-01,1000 days
2378,3037,24,2018-06-01,30,2,0,4,1,11,1,...,0,2018-02-01,29,0.9,2.55,"Trades, Roofing",0.210526,2018-04-04,2021-02-01,1034 days
1760,2260,31,2019-11-01,38,0,0,0,0,25,20,...,0,2019-07-01,27,0.68,1.2,"Telecommunications, Telecommunications Services",0.333333,2014-07-01,2021-04-01,2466 days


It looks as though ```final_palms_date``` (final available PALMS data record per user) is more reliable than the ```drop_date``` and so moving forward the former will be used to determine labels:

- "1" if a member was dropped 
- "0" if the member will be renewing

In [88]:
df_final["wont_renew"] = 0
cond = df_final["relative_renewal_date"] + pd.DateOffset(months=1) >= df_final["final_palms_date"]
df_final.loc[cond, "wont_renew"] = 1
df_final.loc[cond, "wont_renew"].shape

(680,)

In [89]:
df_final[["user_ID", "chapter_ID", "relative_renewal_date", "final_palms_date", "wont_renew"]].sample(10, random_state=13)

Unnamed: 0,user_ID,chapter_ID,relative_renewal_date,final_palms_date,wont_renew
22,31,10,2019-08-01,2020-02-01,0
1693,2180,6,2019-12-01,2021-10-01,0
1589,2017,10,2017-06-01,2017-03-01,1
1907,2445,6,2019-12-01,2019-12-01,1
1239,1631,23,2020-05-01,2020-01-01,1
1568,1993,24,2021-02-01,2021-10-01,0
2301,2935,32,2019-10-01,2021-10-01,0
1064,1422,14,2019-02-01,2018-12-01,1
239,356,23,2018-05-01,2021-03-01,0
1063,1421,9,2018-03-01,2018-03-01,1


In [90]:
df_final.columns

Index(['user_ID', 'chapter_ID', 'relative_renewal_date', 'P', 'A', 'L', 'M',
       'S', 'RGI', 'RGO', 'RRI', 'RRO', 'V', '1-2-1', 'TYFCB', 'CEU',
       'year_of_membership', 'control_count', 'chapter_transfer', 'temp_date',
       'chapter_size', 'chapter_retention_rate', 'chapter_growth_rate',
       'industry', 'seat_popularity_rate', 'drop_date', 'final_palms_date',
       'date_diff', 'wont_renew'],
      dtype='object')

In [91]:
df_final.to_csv("data/df_final.csv")