# Libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import date

# Data Import & join
### Datasets: PALMS
First ensure that none of the files are duplicated by checking control sums.

In [2]:
# Compare the column sums of every monthly PALMS file with those of the
# previous month; identical sums in every column are treated as a duplicate.
start_year = 2016
start_month = 3  # zero-based month offset: offset 3 -> April, the first file read inside the loop
n_files = 59

# March 2016 is the baseline that the first loop iteration is compared against
df_temp = pd.read_csv("data/palms_report_data_2016_03.csv", index_col=0, encoding="ISO-8859-1")
sum_previous = df_temp.sum()

for i in range(n_files):
    offset = start_month + i
    year = start_year + offset // 12
    month = offset % 12 + 1
    file_name = f"data/palms_report_data_{year}_{month:02d}.csv"

    df_temp = pd.read_csv(file_name, index_col=0, encoding="ISO-8859-1")
    sum_current = df_temp.sum()

    # Every column sum matches the previous file -> treat as a duplicate and stop
    if (sum_previous == sum_current).all():
        print("Duplicated readings")
        # Report the path that was actually read (the old message named a different file pattern)
        print(f"Current file: {file_name}")
        break
    sum_previous = sum_current

The loop never reached the `break`, which means that no two consecutive monthly files are duplicates. The check only compares neighbouring months, but it is even less likely that a duplicate was saved under a file name whose month differs by more than one.

Just to double-check, let's evaluate the condition when a file is compared with itself.

In [3]:
# Sanity check: comparing the column sums with themselves must satisfy the duplicate condition
(sum_current == sum_current).sum() == sum_current.shape[0]

True

As expected - everything is working correctly. Let's import the files and concatenate them.

In [4]:
# Read every monthly PALMS export, stamp each row with the first day of its
# month and stack all months into a single frame.
start_year = 2016
start_month = 2  # zero-based month offset: offset 2 -> March, the first file read
n_files = 60

# Collect the monthly frames and concatenate once at the end; calling
# pd.concat inside the loop re-copies all previously read rows on every pass.
monthly_frames = []

for i in range(n_files):
    offset = start_month + i
    year = start_year + offset // 12
    month = offset % 12 + 1

    df_temp = pd.read_csv(f"data/palms_report_data_{year}_{month:02d}.csv", index_col=0, encoding="ISO-8859-1")
    df_temp["palms_date"] = date(year, month, 1)
    monthly_frames.append(df_temp)

df_palms = pd.concat(monthly_frames)

# Move the two columns that sit just before "palms_date" to the front,
# keep the remaining columns in order and leave "palms_date" last.
column_list = df_palms.columns.tolist()
column_list = column_list[-3:-1] + column_list[:-3] + [column_list[-1]]
df_palms = df_palms[column_list]

# Each file carries its own index, so replace it with one continuous range
df_palms = df_palms.reset_index(drop=True)
df_palms

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,palms_date
0,733,1,3,0,0,0,0,0,0,0,0,0,1,0,0,2016-03-01
1,1150,1,3,0,0,0,0,0,0,0,0,0,0,0,0,2016-03-01
2,414,1,3,0,0,0,0,0,0,0,0,0,2,0,0,2016-03-01
3,1721,1,3,0,0,0,0,0,0,0,0,0,0,0,0,2016-03-01
4,1919,1,4,0,0,0,0,0,0,0,0,0,1,0,0,2016-03-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29211,2875,31,4,0,0,0,0,5,4,11,7,1,1,3913,2,2021-02-01
29212,276,31,4,0,0,0,0,3,2,1,0,0,3,4214,8,2021-02-01
29213,585,31,3,1,0,0,0,8,1,2,6,0,7,73,0,2021-02-01
29214,1223,31,3,1,0,0,0,2,4,7,1,0,10,1755,4,2021-02-01


### Dataset: database

In [5]:
# Membership database: one CSV, first column used as the index
df_database = pd.read_csv("data/database_data.csv", index_col=0, encoding="ISO-8859-1")

# Reorder: the second- and third-to-last columns go first, the last column stays last
column_list = df_database.columns.tolist()
column_list = [*column_list[-3:-1], *column_list[:-3], column_list[-1]]
df_database = df_database[column_list]
df_database.head(10)

Unnamed: 0,user_ID,chapter_ID,Profession,Position,Join Date,Renewal Date,sponsor_ID
0,982,22,"Medical, Chiropractor",,01/07/2004,01/12/2010,
1,860,22,"Trades, Renovations-Remodeling",,01/02/2008,01/12/2008,
2,1970,22,"Food/Beverages, Chef",,01/09/2010,01/12/2011,
3,898,22,"Gifts, Gift Baskets",,01/03/2007,07/04/2008,898.0
4,2204,22,"Mortgage, Mortgage Broker",,01/07/2007,01/07/2008,
5,1465,22,"Real estate services, Residential Real Estate ...",,01/07/2007,01/04/2021,
6,2377,22,"Insurance, Life,Health and Disability Insurance",,01/07/2007,01/11/2009,
7,2214,22,"Financial, Investment Advisor",,01/07/2007,01/04/2009,
8,832,22,"Health & Wellness, Massage Therapist",,01/07/2007,01/07/2008,
9,715,22,"Insurance, General-Motor Insurance",,01/10/2007,01/10/2008,898.0


In [6]:
# Switch the descriptive column headers to snake_case names
df_database = df_database.rename(
    columns={
        "Profession": "profession",
        "Position": "position",
        "Join Date": "join_date",
        "Renewal Date": "renewal_date",
    }
)

# Both date columns hold day/month/year text; values that fail to parse become NaT
for date_column in ("join_date", "renewal_date"):
    df_database[date_column] = pd.to_datetime(df_database[date_column], format="%d/%m/%Y", errors="coerce")

# sponsor_ID is left untouched here
df_database.tail()

Unnamed: 0,user_ID,chapter_ID,profession,position,join_date,renewal_date,sponsor_ID
3389,2265,37,"Construction, Builder/General Contractor",,2021-03-01,2022-03-01,2512.0
3390,2736,35,"Legal & Accounting, Real Estate Law",,2021-02-01,2022-02-01,
3391,1827,35,"Construction, Electrician",,2021-02-01,2022-02-01,2419.0
3392,257,35,"Finance & Insurance, Life,Health and Disabilit...",,2021-03-01,2022-03-01,1369.0
3393,132,35,"Finance & Insurance, Residential Mortgages",,2021-03-01,2022-03-01,


### Dataset: dropped_members

In [7]:
df_dropped_members = pd.read_csv("data/dropped_members_data.csv", index_col=0, encoding="ISO-8859-1")

# Put the second-to-last column first; note that this selection leaves out the last column
column_list = df_dropped_members.columns.tolist()
column_list = [*column_list[-2:-1], *column_list[:-2]]

# Discard the "Date/Time" column and give the remaining columns snake_case names
df_dropped_members = (
    df_dropped_members[column_list]
    .drop(columns=["Date/Time"])
    .rename(
        columns={
            "Effective Drop Date": "drop_date",
            "Reason": "drop_reason",
            "Drop Type": "drop_type",
        }
    )
)
df_dropped_members.head(10)

Unnamed: 0,user_ID,drop_date,drop_reason,drop_type
0,1936.0,11/01/2021,Takes Too Much Time,Resigned
1,1546.0,24/02/2021,Personal Reasons,Resigned
2,1622.0,05/02/2021,Other Reason (see notes),Resigned
3,2598.0,16/02/2021,Other Reason (see notes),Resigned
4,2760.0,16/02/2021,Member Transferred BNI Chapters,Resigned
5,1178.0,11/02/2021,Other Reason (see notes),Resigned
6,2699.0,09/02/2021,No Reason Entered,Resigned
7,2474.0,01/02/2021,Member Transferred BNI Chapters,Resigned
8,2462.0,31/01/2021,Personal Reasons,Resigned
9,1479.0,01/12/2020,,System


In [8]:
# Inspect the dtype and non-null count of every column
df_dropped_members.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2093 entries, 0 to 2092
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   user_ID      2077 non-null   float64
 1   drop_date    2093 non-null   object 
 2   drop_reason  1216 non-null   object 
 3   drop_type    2093 non-null   object 
dtypes: float64(1), object(3)
memory usage: 81.8+ KB


Seems like there are some missing values in the ```user_ID``` column, which is odd. There shouldn't be any missing values in this table.

In [9]:
# Rows whose user_ID is missing
df_dropped_members.loc[df_dropped_members["user_ID"].isna()]

Unnamed: 0,user_ID,drop_date,drop_reason,drop_type
1299,,16/03/2015,,Yes
1395,,25/08/2014,,Yes
1418,,15/07/2014,,Yes
1437,,27/05/2014,,Yes
1440,,21/05/2014,,Yes
1676,,09/10/2012,,Yes
1833,,22/10/2011,,Yes
1834,,22/10/2011,,Yes
1835,,22/10/2011,,Yes
1863,,15/08/2011,,Yes


All those records are listed from a time period before the PALMS data that is being looked into, so those records can be dropped.

In [10]:
# Keep only the rows that carry a user_ID, then re-check the column summary
df_dropped_members = df_dropped_members.dropna(subset=["user_ID"])
df_dropped_members.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2077 entries, 0 to 2092
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   user_ID      2077 non-null   float64
 1   drop_date    2077 non-null   object 
 2   drop_reason  1216 non-null   object 
 3   drop_type    2077 non-null   object 
dtypes: float64(1), object(3)
memory usage: 81.1+ KB


In [11]:
# Parse the day/month/year drop dates (unparseable values become NaT) and
# downcast user_ID to the smallest integer dtype that holds its values.
df_dropped_members = df_dropped_members.assign(
    drop_date=pd.to_datetime(df_dropped_members["drop_date"], format="%d/%m/%Y", errors="coerce"),
    user_ID=pd.to_numeric(df_dropped_members["user_ID"], downcast="integer"),
)
df_dropped_members.head()

Unnamed: 0,user_ID,drop_date,drop_reason,drop_type
0,1936,2021-01-11,Takes Too Much Time,Resigned
1,1546,2021-02-24,Personal Reasons,Resigned
2,1622,2021-02-05,Other Reason (see notes),Resigned
3,2598,2021-02-16,Other Reason (see notes),Resigned
4,2760,2021-02-16,Member Transferred BNI Chapters,Resigned


## Join data - create a master dataframe

In [12]:
# Left-join the membership database and the drop records onto the PALMS rows
# by user_ID, then discard the columns that are not carried into the master frame.
df_master = (
    df_palms
    .merge(df_database.drop(columns="chapter_ID"), how="left", on="user_ID")
    .merge(df_dropped_members, how="left", on="user_ID")
    .drop(columns=["drop_reason", "drop_type", "position", "sponsor_ID"])
)
df_master.sample(10, random_state=13)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date
5068,1226,1,3,0,0,0,0,0,3,0,3,0,2,178,1,2016-10-01,"Financial, Investment Advisor",2016-04-01,2017-10-01,NaT
15797,2547,25,3,0,0,0,2,0,0,1,0,0,1,4838,1,2017-11-01,"Real Estate, Real Estate Sales Representative ...",2014-05-01,2016-06-01,2018-01-18
42547,840,25,1,0,0,0,0,0,0,0,0,0,0,0,0,2020-09-01,"Security, Security Services",2020-09-01,2021-02-01,2019-10-04
16824,1448,9,4,0,0,0,0,2,4,1,2,2,10,0,40,2018-01-01,"Information Technology, Design Web",2017-11-01,2018-11-01,2019-11-01
21369,1311,30,4,0,0,0,0,0,8,0,1,1,10,3952,7,2018-06-01,"Financial, Investment Advisor",2017-04-01,2020-04-01,2020-03-19
43444,2184,5,4,0,0,0,0,5,18,3,6,0,24,25437,4,2020-11-01,"Trades, General Contractor",2018-09-01,2019-09-01,2018-09-12
26955,2911,31,4,0,0,0,1,3,3,1,3,0,8,7329,0,2019-01-01,"Trades, Renovations - Exterior",2017-06-01,2019-06-01,2017-04-01
23692,1004,30,4,0,0,0,0,4,10,1,1,1,6,1518,4,2018-09-01,"Real estate services, Residential Real Estate ...",2015-12-01,2021-12-01,NaT
535,818,24,5,0,0,0,0,3,0,2,6,0,6,80,0,2016-03-01,"Marketing, Digital Marketing",2015-12-01,2018-09-01,2018-05-23
5404,82,16,4,0,0,0,0,3,0,0,1,0,4,1319,11,2016-10-01,"Alternative Medicine, Nutritionist",2013-09-01,2017-09-01,2017-04-17


In [13]:
# (rows, columns) of the merged frame
df_master.shape

(46412, 20)

# Data cleaning & aggregation
## Remove duplicates
Check different variants, depending on columns selected - how many records are dropped in each variant.

In [14]:
# Work on a separate frame and make sure both date columns are datetime typed
# (values that cannot be converted become NaT).
df_master_clean = df_master.assign(
    palms_date=pd.to_datetime(df_master["palms_date"], errors="coerce"),
    renewal_date=pd.to_datetime(df_master["renewal_date"], errors="coerce"),
)

len(df_master_clean)

46412

In [15]:
# variant 0: rows left when only rows identical in every column count as duplicates
len(df_master_clean.drop_duplicates())

45262

In [16]:
# List the column labels in their current order
df_master_clean.columns

Index(['user_ID', 'chapter_ID', 'P', 'A', 'L', 'M', 'S', 'RGI', 'RGO', 'RRI',
       'RRO', 'V', '1-2-1', 'TYFCB', 'CEU', 'palms_date', 'profession',
       'join_date', 'renewal_date', 'drop_date'],
      dtype='object')

In [17]:
# variant 1: rows left when duplicates are judged on the first 16 columns only
len(df_master_clean.drop_duplicates(subset=df_master_clean.columns[:16]))

29215

In [18]:
# variant 2: rows left when duplicates are judged on user, chapter and PALMS month
len(df_master_clean.drop_duplicates(subset=["user_ID", "chapter_ID", "palms_date"]))

29193

In [19]:
# variant 3: rows left when duplicates are judged on user and PALMS month only
len(df_master_clean.drop_duplicates(subset=["user_ID", "palms_date"]))

29158

Let's go with **variant 1**, which is more precise than **variant 0**. It seems that there are some mix-ups in the latter columns: 'profession', 'join_date', 'renewal_date', 'drop_date'

In [20]:
# Apply variant 1: for each combination of the first 16 columns keep the first row
df_master_clean = df_master_clean.drop_duplicates(subset=df_master_clean.columns[:16])
df_master_clean.shape

(29215, 20)

## Get relative renewal date for data aggregation

In [21]:
# Average month length: 365.2425 days / 12 = 2,629,746 seconds. It is spelled
# out as a fixed Timedelta because np.timedelta64(1, 'M') is an ambiguous
# unit that newer pandas releases refuse to use as a divisor.
AVERAGE_MONTH = pd.Timedelta(seconds=2_629_746)

# Whole years between the join date and the PALMS month
# (negative when the PALMS record pre-dates the join date)
months_since_join = (df_master_clean["palms_date"] - df_master_clean["join_date"]) / AVERAGE_MONTH
df_master_clean["year_of_membership"] = months_since_join.round().astype(int) // 12

# Rounded number of months from the PALMS month to the recorded renewal date
months_to_renewal = (df_master_clean["renewal_date"] - df_master_clean["palms_date"]) / AVERAGE_MONTH
months_to_renewal = months_to_renewal.round().astype(int)

# Number of whole years the recorded renewal date lies beyond the next renewal
years_to_renewal = (months_to_renewal - 1) // 12

# Fold onto a 1..12 scale; a remainder of 0 becomes 12 for ease of aggregation later on
df_master_clean["months_to_renewal"] = (months_to_renewal % 12).replace(0, 12)

# Shift the recorded renewal date back by the whole years computed above.
# Built in one pass over the two columns instead of iterrows() + .at,
# which assigned the values one cell at a time.
df_master_clean["relative_renewal_date"] = [
    renewal - pd.DateOffset(years=int(years))
    for renewal, years in zip(df_master_clean["renewal_date"], years_to_renewal)
]

df_master_clean.head(10)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
0,733,1,3,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Food/Beverages, Caterer",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01
2,1150,1,3,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Trades, Heating & A/C",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01
3,414,1,3,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Insurance, Group Benefits Consultant",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01
4,1721,1,3,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Marketing, Marketing Services",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01
5,1919,1,4,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Real Estate, Real Estate Sales Representative ...",2016-04-01,2018-12-01,NaT,-1,9,2016-12-01
6,631,1,3,1,0,0,0,0,0,0,...,0,0,2016-03-01,"Office, Full Service Offices",2016-04-01,2017-10-01,NaT,-1,7,2016-10-01
7,2260,1,2,1,0,0,0,0,0,0,...,0,0,2016-03-01,"Information Technology, Computer Sales and Ser...",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01
8,313,1,2,1,0,0,0,0,0,0,...,0,0,2016-03-01,"Health & Wellness, Gym",2016-04-01,2018-09-01,NaT,-1,6,2016-09-01
9,1267,1,3,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Accounting, Bookkeeper",2016-04-01,2019-04-01,NaT,-1,1,2016-04-01
10,1870,1,3,1,0,0,0,0,0,0,...,0,0,2016-03-01,"Legal, Solicitor",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01


## Ensure data Integrity
 1. check if ```year_of_membership``` is a negative number
 2. Remove any remaining duplicate entries

Let's tackle the first item:

### 1. Check if ```year_of_membership``` is a negative number
A bit of context: BNI chapters can be split into two categories:
1. fully established
2. forming (known as "core groups")

BNI's membership model for core groups is that an approved applicant pays for one-year membership when he first joins the group, but the ```join_date``` is set to the date of the group "launch" - when it becomes fully established as a chapter. That being said the PALMS data can be entered for core group but it only becomes available after the group is launched. For this reason there is no PALMS data for ```chapter_ID == 2```, which is a core group that has not been launched yet:

In [22]:
# Highest chapter_ID present in the de-duplicated frame
df_master_clean["chapter_ID"].max()

31

In [23]:
# Rows for chapter 2; the mask is built from the same frame it filters
# (it used to come from df_master and only worked through index alignment)
df_master_clean.loc[df_master_clean["chapter_ID"] == 2]

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date


Unfortunately the available data contains neither the launch date of each group nor the date on which each core group member actually joined.

**Further discussion:** the core group members join date problem extends even further because it creates a false notion that a membership is shorter than it actually has been. An example of this can be found  in ```chapter_ID == 27``` which is a core group that has been launched sometime between 2019 and 2020:

In [24]:
# Chapter 27: one row per member, then count members per join_date in date order
(
    df_master_clean
    .query("chapter_ID == 27")
    .drop_duplicates("user_ID")["join_date"]
    .value_counts()
    .sort_index()
)

2016-02-01     1
2019-02-01     1
2019-10-01     1
2020-03-01    15
2020-06-01     2
2020-07-01     2
2020-10-01     1
2020-12-01     1
2021-02-01     4
Name: join_date, dtype: int64

The majority of the group, 15 members have a ```join_date``` set to **March 1st, 2020** which is most likely the date when the chapter launched. This means that any ```join_date``` after March 1st is not problematic as the person joined the group after it was launched and it is the true date of when the member joined.

There is, however, a significant problem for members who started their membership before and during March 1st. It is unclear why there are some members with a ```join_date``` prior to March 1st. Regardless, in both cases of members' ```join_date``` set to March 1st or prior to that date one cannot be sure if it is the true date when the member actually joined.

This is not the case with a well established chapter such as ```chapter_ID == 8```:

In [25]:
# Chapter 8: one row per member, then count members per join_date in date order
(
    df_master_clean
    .query("chapter_ID == 8")
    .drop_duplicates("user_ID")["join_date"]
    .value_counts()
    .sort_index()
)

1998-05-01    1
2002-09-01    1
2006-11-01    1
2010-08-01    1
2011-09-01    1
             ..
2020-05-01    1
2020-07-01    1
2020-12-01    2
2021-01-01    2
2021-02-01    1
Name: join_date, Length: 61, dtype: int64

Let's have a look at which chapters contain members with a negative ```year_of_membership```.

In [26]:
# Chapters that have at least one record with a negative year_of_membership
print(sorted(df_master_clean.query("year_of_membership < 0")["chapter_ID"].unique()))

[1, 3, 5, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 20, 22, 23, 24, 25, 27, 28, 30, 31]


My initial hypothesis was that only core groups have members with negative ```year_of_membership``` but it seems that fully established chapters do as well.

In this case, let's drop the idea of looking at groups and instead look at members who have any records with a negative ```year_of_membership```, and try to fix them by cross-checking with the original data.

In [27]:
# All records whose year_of_membership is negative
df_master_clean.query("year_of_membership < 0")

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
0,733,1,3,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Food/Beverages, Caterer",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01
2,1150,1,3,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Trades, Heating & A/C",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01
3,414,1,3,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Insurance, Group Benefits Consultant",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01
4,1721,1,3,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Marketing, Marketing Services",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01
5,1919,1,4,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Real Estate, Real Estate Sales Representative ...",2016-04-01,2018-12-01,NaT,-1,9,2016-12-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45967,1396,14,1,0,0,0,0,0,2,0,...,0,0,2021-02-01,"Real estate services, Home Inspection",2021-03-01,2022-03-01,NaT,-1,1,2021-03-01
46001,1799,15,2,0,0,0,0,0,0,0,...,0,0,2021-02-01,"Finance & Insurance, Supplemental Insurance",2021-03-01,2022-03-01,NaT,-1,1,2021-03-01
46005,17,15,1,0,0,0,0,0,0,0,...,0,0,2021-02-01,"Health & Wellness, Chiropractor",2021-03-01,2022-03-01,NaT,-1,1,2021-03-01
46010,2195,16,1,0,0,0,0,0,0,0,...,0,0,2021-02-01,"Advertising & Marketing, Photographer",2021-03-01,2022-03-01,NaT,-1,1,2021-03-01


In [28]:
# Distinct user_IDs that have at least one record with a negative year_of_membership
df_master_clean.query("year_of_membership < 0")["user_ID"].unique()

array([ 733, 1150,  414, 1721, 1919,  631, 2260,  313, 1267, 1870,  920,
       1320,  740, 1074, 2105, 1226, 2162, 1764, 2471,  540,  479, 2764,
        282, 1964, 1997,  676, 1217,  839, 2237, 2883, 2279, 2464,  475,
       2053, 2613, 2164, 1314,  454, 1381, 2911, 1136,  790,  108,  816,
       2679, 1646, 1168, 1953, 1957, 2648, 2568,  970, 2721,  651, 2358,
       1081, 2681,  219, 1146, 2709,  443, 2205, 1832,  258, 2015, 2791,
       1033,  344, 2546, 1671, 1236,    9, 1659, 1421,  554,  658, 2640,
        774, 2513, 2024, 2497, 1268, 2549,   18, 1485,  941, 1714, 2292,
       2394, 1916, 1201, 2891,  104,  752,  327, 1846, 1942, 1700, 1522,
        103, 1606,  448,  230, 1270, 2001, 2175, 1084, 2850,  929, 1175,
       2778,  333, 1703, 2673, 1260, 2657,  944, 1597, 2563, 1855,  328,
       1295,  582, 1292, 1593, 1818,  675, 2842,  626, 2772,  701, 1388,
       2077,   41, 2896, 2853, 1608, 1417, 1120,  216, 1583,  889, 2854,
       2427,  639, 2785,  965, 2692, 1922, 1117,  8

There are many records with a negative ```year_of_membership```, and it would be too tedious to look into every single one individually. There might, however, be a pattern: the first one or two months of PALMS data for a new member pre-date the actual ```join_date```. With this in mind, I recall that a member was allowed to start attending chapter meetings a little before being entered into the system. Records from one or two months prior to the member's ```join_date``` aren't a problem, as they can safely be dropped and excluded from the aggregation later on.

There is, however, a **problem if a member has more than two months of negative** ```year_of_membership``` because it might indicate a different situation than the one above. Let's create a new dataframe which counts how many PALMS records contain a negative ```year_of_membership``` per user.

In [29]:
# Per user: number of PALMS records with a negative year_of_membership
df_negative = (
    df_master_clean
    .query("year_of_membership < 0")
    .groupby("user_ID")
    .size()
    .to_frame("negative_months")
)
df_negative

Unnamed: 0_level_0,negative_months
user_ID,Unnamed: 1_level_1
6,8
9,1
17,1
18,1
26,1
...,...
2903,1
2904,1
2911,1
2915,1


There are 344 users with at least one month with negative ```year_of_membership```.

### Members with two or fewer negative months
First let's look at a few randomly sampled members who have 2 or fewer ```negative_months``` to ensure that these records are indeed just a few months before the actual join date. If those members indeed have just a month or two **before** their ```join_date```, then those months can be disregarded and dropped - this data should not be taken into account for the aggregation as it would produce incorrect results.

In [30]:
# Number of users with at most two negative months
len(df_negative.query("negative_months <= 2"))

321

In [31]:
# Reproducible sample of ten users with at most two negative months
df_negative.query("negative_months <= 2").sample(10, random_state=13)

Unnamed: 0_level_0,negative_months
user_ID,Unnamed: 1_level_1
485,1
2278,1
108,1
1074,1
1136,1
1846,1
1881,1
1421,1
1320,1
1201,1


#### - User 485

In [32]:
# Earliest five PALMS records of user 485
df_master_clean.query("user_ID == 485").sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
37290,485,28,1,0,0,0,0,0,1,0,...,0,0,2020-02-01,"Finance & Insurance, Commercial Insurance",2020-03-01,2021-03-01,NaT,-1,1,2020-03-01
38073,485,28,4,0,0,0,0,5,1,5,...,7167,4,2020-03-01,"Finance & Insurance, Commercial Insurance",2020-03-01,2021-03-01,NaT,0,12,2021-03-01
38860,485,28,5,0,0,0,0,2,4,2,...,3030,4,2020-04-01,"Finance & Insurance, Commercial Insurance",2020-03-01,2021-03-01,NaT,0,11,2021-03-01
39627,485,28,4,0,0,0,0,2,7,1,...,2474,3,2020-05-01,"Finance & Insurance, Commercial Insurance",2020-03-01,2021-03-01,NaT,0,10,2021-03-01
40376,485,28,4,0,0,0,0,0,3,0,...,4234,7,2020-06-01,"Finance & Insurance, Commercial Insurance",2020-03-01,2021-03-01,NaT,0,9,2021-03-01


Indeed there is only one month where the ```year_of_membership``` is negative for user 485, and it is the month just before the ```join_date```. Looking at the PALMS columns, the member attended 1 meeting in the 2020-02-01 report, i.e. before the recorded ```join_date``` of 2020-03-01. This one record could be safely dropped.
#### - User 2278

In [33]:
# Earliest five PALMS records of user 2278
df_master_clean.query("user_ID == 2278").sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
16980,2278,13,2,0,0,0,0,4,0,5,...,0,0,2018-01-01,"Marketing, Marketing Strategy",2018-02-01,2019-02-01,2019-01-09,-1,1,2018-02-01
17797,2278,13,4,0,0,0,0,9,0,6,...,1201,6,2018-02-01,"Marketing, Marketing Strategy",2018-02-01,2019-02-01,2019-01-09,0,12,2019-02-01
18583,2278,13,4,0,0,0,0,1,2,1,...,5579,38,2018-03-01,"Marketing, Marketing Strategy",2018-02-01,2019-02-01,2019-01-09,0,11,2019-02-01
19366,2278,13,3,1,0,0,0,4,1,0,...,498,11,2018-04-01,"Marketing, Marketing Strategy",2018-02-01,2019-02-01,2019-01-09,0,10,2019-02-01
20154,2278,13,5,0,0,0,0,2,1,3,...,4320,16,2018-05-01,"Marketing, Marketing Strategy",2018-02-01,2019-02-01,2019-01-09,0,9,2019-02-01


Same case as User 485
#### - User 108 

In [34]:
# Earliest five PALMS records of user 108
df_master_clean.query("user_ID == 108").sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
784,108,5,1,0,0,0,0,0,0,0,...,0,0,2016-04-01,"Information Technology, I.T. Specialist",2016-05-01,2017-05-01,2016-08-03,-1,1,2016-05-01
1493,108,5,5,0,0,0,0,3,0,1,...,0,1,2016-05-01,"Information Technology, I.T. Specialist",2016-05-01,2017-05-01,2016-08-03,0,12,2017-05-01
2210,108,5,4,0,0,0,0,1,1,0,...,1149,0,2016-06-01,"Information Technology, I.T. Specialist",2016-05-01,2017-05-01,2016-08-03,0,11,2017-05-01
2937,108,5,4,0,0,0,0,1,5,0,...,0,0,2016-07-01,"Information Technology, I.T. Specialist",2016-05-01,2017-05-01,2016-08-03,0,10,2017-05-01
3648,108,5,2,0,0,0,0,0,0,0,...,0,2,2016-08-01,"Information Technology, I.T. Specialist",2016-05-01,2017-05-01,2016-08-03,0,9,2017-05-01


#### - User 1074

In [35]:
# Earliest five PALMS records of user 1074
df_master_clean.query("user_ID == 1074").sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
20,1074,1,3,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Employment, Professional Recruitment",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01
737,1074,1,2,0,1,0,1,0,0,0,...,0,12,2016-04-01,"Employment, Professional Recruitment",2016-04-01,2017-04-01,NaT,0,12,2017-04-01
1446,1074,1,3,1,0,0,0,2,0,0,...,6590,1,2016-05-01,"Employment, Professional Recruitment",2016-04-01,2017-04-01,NaT,0,11,2017-04-01
2161,1074,1,4,0,0,0,1,1,1,0,...,3295,1,2016-06-01,"Employment, Professional Recruitment",2016-04-01,2017-04-01,NaT,0,10,2017-04-01
2890,1074,1,2,0,0,0,1,0,0,0,...,3295,0,2016-07-01,"Employment, Professional Recruitment",2016-04-01,2017-04-01,NaT,0,9,2017-04-01


#### - User 1136

In [36]:
# Earliest five PALMS records of user 1136
df_master_clean.query("user_ID == 1136").sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
695,1136,31,2,0,0,0,0,1,0,0,...,0,0,2016-03-01,"Health & Wellness, Fitness Trainer",2016-04-01,2018-04-01,2017-07-04,-1,1,2016-04-01
1399,1136,31,4,0,0,0,0,4,1,1,...,134,0,2016-04-01,"Health & Wellness, Fitness Trainer",2016-04-01,2018-04-01,2017-07-04,0,12,2017-04-01
2118,1136,31,4,1,0,0,0,2,2,2,...,527,0,2016-05-01,"Health & Wellness, Fitness Trainer",2016-04-01,2018-04-01,2017-07-04,0,11,2017-04-01
2847,1136,31,4,0,0,0,0,3,0,0,...,533,0,2016-06-01,"Health & Wellness, Fitness Trainer",2016-04-01,2018-04-01,2017-07-04,0,10,2017-04-01
3562,1136,31,4,0,0,0,0,3,0,0,...,79,0,2016-07-01,"Health & Wellness, Fitness Trainer",2016-04-01,2018-04-01,2017-07-04,0,9,2017-04-01


#### - User 1846

In [37]:
# Earliest five PALMS records of user 1846
df_master_clean.query("user_ID == 1846").sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
7616,1846,16,2,0,0,0,0,0,0,0,...,0,0,2017-01-01,"Financial, Investment Advisor",2017-02-01,2019-02-01,2018-06-11,-1,1,2017-02-01
8346,1846,16,4,0,0,0,0,3,3,0,...,0,2,2017-02-01,"Financial, Investment Advisor",2017-02-01,2019-02-01,2018-06-11,0,12,2018-02-01
9074,1846,16,5,0,0,0,0,0,3,0,...,0,33,2017-03-01,"Financial, Investment Advisor",2017-02-01,2019-02-01,2018-06-11,0,11,2018-02-01
9803,1846,16,4,0,0,0,0,1,1,0,...,0,6,2017-04-01,"Financial, Investment Advisor",2017-02-01,2019-02-01,2018-06-11,0,10,2018-02-01
10587,1846,16,5,0,0,0,0,1,3,0,...,1527,6,2017-05-01,"Financial, Investment Advisor",2017-02-01,2019-02-01,2018-06-11,0,9,2018-02-01


#### - User 1881

In [38]:
# Earliest five PALMS records of user 1881
df_master_clean.query("user_ID == 1881").sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
15527,1881,18,2,0,0,0,0,0,2,0,...,0,0,2017-11-01,"Health & Wellness, Nutritionist",2017-12-01,2021-12-01,NaT,-1,1,2017-12-01
16298,1881,18,3,0,0,0,0,2,0,0,...,60,6,2017-12-01,"Health & Wellness, Nutritionist",2017-12-01,2021-12-01,NaT,0,12,2018-12-01
17109,1881,18,4,0,0,0,0,1,5,1,...,1223,6,2018-01-01,"Health & Wellness, Nutritionist",2017-12-01,2021-12-01,NaT,0,11,2018-12-01
17916,1881,18,4,0,0,0,0,1,7,2,...,190,6,2018-02-01,"Health & Wellness, Nutritionist",2017-12-01,2021-12-01,NaT,0,10,2018-12-01
18702,1881,18,3,0,0,0,1,1,2,1,...,80,5,2018-03-01,"Health & Wellness, Nutritionist",2017-12-01,2021-12-01,NaT,0,9,2018-12-01


#### - User 1421

In [39]:
# Earliest five PALMS records of user 1421
df_master_clean.query("user_ID == 1421").sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
5083,1421,3,1,0,0,0,0,0,0,0,...,0,0,2016-10-01,"Legal, Lawyer Family Law",2016-11-01,2017-11-01,NaT,-1,1,2016-11-01
5850,1421,3,3,0,1,0,1,0,0,0,...,0,1,2016-11-01,"Legal, Lawyer Family Law",2016-11-01,2017-11-01,NaT,0,12,2017-11-01
6594,1421,3,4,0,0,0,0,0,3,0,...,0,3,2016-12-01,"Legal, Lawyer Family Law",2016-11-01,2017-11-01,NaT,0,11,2017-11-01
7332,1421,3,3,1,1,0,0,1,0,0,...,303,4,2017-01-01,"Legal, Lawyer Family Law",2016-11-01,2017-11-01,NaT,0,10,2017-11-01
8066,1421,3,3,1,0,0,0,3,3,1,...,40,3,2017-02-01,"Legal, Lawyer Family Law",2016-11-01,2017-11-01,NaT,0,9,2017-11-01


#### - User 1320

In [40]:
# Earliest five PALMS records of user 1320
df_master_clean.query("user_ID == 1320").sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
18,1320,1,4,0,0,0,0,0,0,0,...,0,8,2016-03-01,"Accounting, Accountant",2016-04-01,2018-06-01,NaT,-1,3,2016-06-01
735,1320,1,3,1,0,0,0,1,0,0,...,100,4,2016-04-01,"Accounting, Accountant",2016-04-01,2018-06-01,NaT,0,2,2016-06-01
1444,1320,1,3,1,0,0,0,1,0,1,...,105,2,2016-05-01,"Accounting, Accountant",2016-04-01,2018-06-01,NaT,0,1,2016-06-01
2159,1320,1,4,1,0,0,0,0,0,1,...,0,2,2016-06-01,"Accounting, Accountant",2016-04-01,2018-06-01,NaT,0,12,2017-06-01
2888,1320,1,3,0,0,0,0,0,0,0,...,185,0,2016-07-01,"Accounting, Accountant",2016-04-01,2018-06-01,NaT,0,11,2017-06-01


#### - User 1201

In [41]:
# Earliest five PALMS records of user 1201
df_master_clean.query("user_ID == 1201").sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
6207,1201,17,2,0,0,0,0,0,4,1,...,0,0,2016-11-01,"Photography, Photographer",2016-12-01,2020-06-01,2020-07-12,-1,7,2017-06-01
6937,1201,17,2,0,0,0,0,1,4,1,...,0,6,2016-12-01,"Photography, Photographer",2016-12-01,2020-06-01,2020-07-12,0,6,2017-06-01
7673,1201,17,4,0,0,0,0,4,12,1,...,175,24,2017-01-01,"Photography, Photographer",2016-12-01,2020-06-01,2020-07-12,0,5,2017-06-01
8397,1201,17,3,0,0,0,1,0,7,1,...,88,10,2017-02-01,"Photography, Photographer",2016-12-01,2020-06-01,2020-07-12,0,4,2017-06-01
9123,1201,17,5,0,0,0,0,3,10,2,...,1810,63,2017-03-01,"Photography, Photographer",2016-12-01,2020-06-01,2020-07-12,0,3,2017-06-01


**Conclusion**

Looking at 10 randomly sampled members, there is a clear pattern for members with two or fewer records with a negative ```year_of_membership```: the negative record always falls before the actual ```join_date``` and can be disregarded and dropped. This will be done, however, only after dealing with the members who have more than two negative months.

### Members with more than two negative months
The data for each member can be cross-checked, and the incorrect records corrected, based on the ```df_database``` dataset.

In [42]:
# Members with more than two negative-membership months, worst offenders first.
(
    df_negative[df_negative["negative_months"].gt(2)]
    .sort_values(by="negative_months", ascending=False)
)

Unnamed: 0_level_0,negative_months
user_ID,Unnamed: 1_level_1
2764,33
2883,30
1767,29
965,27
35,25
2411,25
1686,21
274,20
2692,19
889,18


In [43]:
# How many members have more than two negative-membership months (rows, columns).
(
    df_negative[df_negative["negative_months"].gt(2)]
    .sort_values(by="negative_months", ascending=False)
    .shape
)

(23, 1)

### Cleaning single users:
Cleaning those records requires some knowledge of the data - namely, whether a member dropped and rejoined some time later, or transferred chapters.

For each user, my methodology consists of:
1. Check the user's PALMS records for discrepancies in ```chapter_ID```, ```join_date``` or ```renewal_date```.
2. Pull up ```df_database``` to see all of the user's ```join_date```s:
 - If there are several and the member transferred chapters - impute every ```join_date``` in the PALMS data with the oldest date from ```df_database```.
 - If there are several and the member left BNI and rejoined some time later - attempt to calculate the cumulative membership year and separate those PALMS records from the original ones for later merging.
3. Pull up ```df_database``` to see all of the user's ```drop_date```s:
 - If there are none - assume that the member transferred chapters and never left BNI.
 - Check the most recent PALMS date that contains the member's data.

#### - User 2764

In [44]:
# User 2764: reproducible random sample of five PALMS rows.
df_master_clean[df_master_clean["user_ID"].eq(2764)].sample(n=5, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
2916,2764,3,3,0,0,0,1,1,0,0,...,1670,1,2016-07-01,"Finance & Insurance, Financial Advisor/Financi...",2018-12-01,2022-03-01,NaT,-3,8,2017-03-01
5858,2764,3,5,0,0,0,0,0,4,0,...,2753,18,2016-11-01,"Finance & Insurance, Financial Advisor/Financi...",2018-12-01,2022-03-01,NaT,-3,4,2017-03-01
42933,2764,11,3,0,0,0,1,1,7,0,...,505,5,2020-10-01,"Finance & Insurance, Financial Advisor/Financi...",2018-12-01,2022-03-01,NaT,1,5,2021-03-01
11910,2764,3,3,1,0,0,0,0,2,0,...,0,2,2017-07-01,"Finance & Insurance, Financial Advisor/Financi...",2018-12-01,2022-03-01,NaT,-2,8,2018-03-01
43664,2764,11,3,0,0,0,0,1,6,0,...,870,3,2020-11-01,"Finance & Insurance, Financial Advisor/Financi...",2018-12-01,2022-03-01,NaT,1,4,2021-03-01


This user has the same ```join_date``` for all of their records but a varying ```chapter_ID```. Having a look at the ```df_database``` it seems as though the member transferred chapters from **3** to **11**.

In [45]:
# User 2764: final five rows recorded under chapter 3.
df_master_clean[
    df_master_clean["user_ID"].eq(2764) & df_master_clean["chapter_ID"].eq(3)
].tail(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
22211,2764,3,3,0,0,0,1,0,2,0,...,0,5,2018-08-01,"Finance & Insurance, Financial Advisor/Financi...",2018-12-01,2022-03-01,NaT,-1,7,2019-03-01
22980,2764,3,3,0,0,0,1,1,2,0,...,0,9,2018-09-01,"Finance & Insurance, Financial Advisor/Financi...",2018-12-01,2022-03-01,NaT,-1,6,2019-03-01
23791,2764,3,4,0,0,0,0,0,4,0,...,0,11,2018-10-01,"Finance & Insurance, Financial Advisor/Financi...",2018-12-01,2022-03-01,NaT,-1,5,2019-03-01
24595,2764,3,4,0,0,0,0,0,2,0,...,0,5,2018-11-01,"Finance & Insurance, Financial Advisor/Financi...",2018-12-01,2022-03-01,NaT,-1,4,2019-03-01
25370,2764,3,2,0,0,0,0,0,3,0,...,0,5,2018-12-01,"Finance & Insurance, Financial Advisor/Financi...",2018-12-01,2022-03-01,NaT,0,3,2019-03-01


In [46]:
# User 2764: first five rows recorded under chapter 11.
df_master_clean[
    df_master_clean["user_ID"].eq(2764) & df_master_clean["chapter_ID"].eq(11)
].head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
25649,2764,11,2,0,0,0,0,0,2,0,...,0,3,2018-12-01,"Finance & Insurance, Financial Advisor/Financi...",2018-12-01,2022-03-01,NaT,0,3,2019-03-01
26460,2764,11,4,0,0,0,0,1,8,3,...,23,12,2019-01-01,"Finance & Insurance, Financial Advisor/Financi...",2018-12-01,2022-03-01,NaT,0,2,2019-03-01
27297,2764,11,4,0,0,0,0,1,8,5,...,500,9,2019-02-01,"Finance & Insurance, Financial Advisor/Financi...",2018-12-01,2022-03-01,NaT,0,1,2019-03-01
28146,2764,11,4,0,0,0,0,1,7,1,...,223,4,2019-03-01,"Finance & Insurance, Financial Advisor/Financi...",2018-12-01,2022-03-01,NaT,0,12,2020-03-01
28983,2764,11,3,0,0,0,1,1,4,0,...,860,6,2019-04-01,"Finance & Insurance, Financial Advisor/Financi...",2018-12-01,2022-03-01,NaT,0,11,2020-03-01


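Fix the transfer-month data: for 2018-12-01 the member has one record in each chapter, so add the old chapter's (3) numeric PALMS values to the new chapter's (11) record and drop the old record.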

In [47]:
# Transfer month 2018-12-01 for user 2764: row 25370 is the chapter 3 record and
# row 25649 the chapter 11 record. Fold the old chapter's values into the new
# chapter's row, then remove the old row so the month appears only once.
palms_cols = df_master_clean.columns[2:15]  # presumably the PALMS counters P ... CEU -- TODO confirm
old_row, new_row = 25370, 25649

# Merge only while the old row still exists: re-running the cell is then a no-op
# rather than a KeyError, and the values can never be added twice.
if old_row in df_master_clean.index:
    df_master_clean.loc[[new_row], palms_cols] = (
        df_master_clean.loc[new_row, palms_cols].values
        + df_master_clean.loc[old_row, palms_cols].values
    )
    df_master_clean.drop(index=old_row, inplace=True)

In [48]:
# Distinct join dates on file for user 2764 in the membership database.
df_database[df_database["user_ID"].eq(2764)][["join_date"]].drop_duplicates()

Unnamed: 0,join_date
722,2018-12-01
2773,2016-03-01


According to ```df_database```, the member's earliest ```join_date``` is 2016-03-01, so the ```join_date``` in the PALMS data is corrected to that date.

In [49]:
# Use the earliest join date found in df_database for all of user 2764's PALMS rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(2764), "join_date"
] = pd.Timestamp("2016-03-01")

In [50]:
# Most recent PALMS month in which user 2764 appears.
df_master_clean["palms_date"][df_master_clean["user_ID"].eq(2764)].max()

Timestamp('2021-02-01 00:00:00')

In [51]:
# Treat user 2764 as never having dropped: blank out drop_date on all their rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(2764), "drop_date"
] = pd.NaT

#### - User 2883

In [52]:
# User 2883: reproducible random sample of five PALMS rows.
df_master_clean[df_master_clean["user_ID"].eq(2883)].sample(n=5, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
3194,2883,16,4,0,0,0,0,0,1,0,...,0,3,2016-07-01,"Finance & Insurance, Credit Card/Merchant Serv...",2018-09-01,2022-03-01,2018-09-06,-3,8,2017-03-01
6159,2883,16,5,0,0,0,0,2,1,1,...,415,17,2016-11-01,"Finance & Insurance, Credit Card/Merchant Serv...",2018-09-01,2022-03-01,2018-09-06,-2,4,2017-03-01
42809,2883,8,5,0,0,0,0,2,3,1,...,200,19,2020-10-01,"Finance & Insurance, Credit Card/Merchant Serv...",2018-09-01,2022-03-01,2018-09-06,2,5,2021-03-01
12240,2883,16,4,0,0,0,0,0,2,2,...,1401,9,2017-07-01,"Finance & Insurance, Credit Card/Merchant Serv...",2018-09-01,2022-03-01,2018-09-06,-2,8,2018-03-01
43544,2883,8,4,0,0,0,0,2,2,0,...,357,19,2020-11-01,"Finance & Insurance, Credit Card/Merchant Serv...",2018-09-01,2022-03-01,2018-09-06,2,4,2021-03-01


Similar case to user 2764 - a chapter transfer from **16** to **8**.

In [53]:
# User 2883: final five rows recorded under chapter 16.
df_master_clean[
    df_master_clean["user_ID"].eq(2883) & df_master_clean["chapter_ID"].eq(16)
].tail(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
20230,2883,16,3,0,1,0,1,0,2,0,...,0,17,2018-05-01,"Finance & Insurance, Credit Card/Merchant Serv...",2018-09-01,2022-03-01,2018-09-06,-1,10,2019-03-01
21025,2883,16,3,0,1,0,0,0,4,2,...,194,12,2018-06-01,"Finance & Insurance, Credit Card/Merchant Serv...",2018-09-01,2022-03-01,2018-09-06,-1,9,2019-03-01
21793,2883,16,3,0,1,0,0,0,0,0,...,0,24,2018-07-01,"Finance & Insurance, Credit Card/Merchant Serv...",2018-09-01,2022-03-01,2018-09-06,-1,8,2019-03-01
22568,2883,16,4,0,1,0,0,0,7,2,...,2430,41,2018-08-01,"Finance & Insurance, Credit Card/Merchant Serv...",2018-09-01,2022-03-01,2018-09-06,-1,7,2019-03-01
23354,2883,16,0,2,0,0,0,0,0,0,...,300,0,2018-09-01,"Finance & Insurance, Credit Card/Merchant Serv...",2018-09-01,2022-03-01,2018-09-06,0,6,2019-03-01


In [54]:
# User 2883: first five rows recorded under chapter 8.
df_master_clean[
    df_master_clean["user_ID"].eq(2883) & df_master_clean["chapter_ID"].eq(8)
].head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
23114,2883,8,4,0,0,0,0,0,3,1,...,904,16,2018-09-01,"Finance & Insurance, Credit Card/Merchant Serv...",2018-09-01,2022-03-01,2018-09-06,0,6,2019-03-01
23926,2883,8,4,0,0,0,0,1,2,0,...,572,18,2018-10-01,"Finance & Insurance, Credit Card/Merchant Serv...",2018-09-01,2022-03-01,2018-09-06,0,5,2019-03-01
24715,2883,8,5,0,0,0,0,1,2,1,...,0,17,2018-11-01,"Finance & Insurance, Credit Card/Merchant Serv...",2018-09-01,2022-03-01,2018-09-06,0,4,2019-03-01
25528,2883,8,3,0,0,0,0,0,5,0,...,575,3,2018-12-01,"Finance & Insurance, Credit Card/Merchant Serv...",2018-09-01,2022-03-01,2018-09-06,0,3,2019-03-01
26338,2883,8,4,0,0,0,0,0,9,0,...,0,41,2019-01-01,"Finance & Insurance, Credit Card/Merchant Serv...",2018-09-01,2022-03-01,2018-09-06,0,2,2019-03-01


In [55]:
# Transfer month 2018-09-01 for user 2883: row 23354 is the chapter 16 record and
# row 23114 the chapter 8 record. Fold the old chapter's values into the new
# chapter's row, then remove the old row so the month appears only once.
palms_cols = df_master_clean.columns[2:15]  # presumably the PALMS counters P ... CEU -- TODO confirm
old_row, new_row = 23354, 23114

# Merge only while the old row still exists: re-running the cell is then a no-op
# rather than a KeyError, and the values can never be added twice.
if old_row in df_master_clean.index:
    df_master_clean.loc[[new_row], palms_cols] = (
        df_master_clean.loc[new_row, palms_cols].values
        + df_master_clean.loc[old_row, palms_cols].values
    )
    df_master_clean.drop(index=old_row, inplace=True)

In [56]:
# Distinct join dates on file for user 2883 in the membership database.
df_database[df_database["user_ID"].eq(2883)][["join_date"]].drop_duplicates()

Unnamed: 0,join_date
432,2018-09-01
2027,2015-12-01


In [57]:
# Use the earliest join date found in df_database for all of user 2883's PALMS rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(2883), "join_date"
] = pd.Timestamp("2015-12-01")

In [58]:
# Most recent PALMS month in which user 2883 appears.
df_master_clean["palms_date"][df_master_clean["user_ID"].eq(2883)].max()

Timestamp('2021-02-01 00:00:00')

In [59]:
# PALMS records continue after the recorded drop_date, so treat user 2883 as
# never having dropped: blank out drop_date on all their rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(2883), "drop_date"
] = pd.NaT

#### - User 1767 

In [60]:
# User 1767: reproducible random sample of eight PALMS rows.
df_master_clean[df_master_clean["user_ID"].eq(1767)].sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
40753,1767,14,4,0,0,0,0,2,1,1,...,630,2,2020-07-01,"Health & Wellness, Chiropractor",2020-10-01,2022-06-01,2020-09-01,-1,11,2021-06-01
38467,1767,14,4,0,0,0,0,1,3,0,...,135,2,2020-04-01,"Health & Wellness, Chiropractor",2020-10-01,2022-06-01,2020-09-01,-1,2,2020-06-01
39248,1767,14,3,1,0,0,0,1,1,2,...,0,1,2020-05-01,"Health & Wellness, Chiropractor",2020-10-01,2022-06-01,2020-09-01,-1,1,2020-06-01
44296,1767,9,5,0,0,0,0,0,3,3,...,6343,1,2020-12-01,"Health & Wellness, Chiropractor",2020-10-01,2022-06-01,2020-09-01,0,6,2021-06-01
31586,1767,14,5,0,0,0,0,2,1,2,...,0,6,2019-07-01,"Health & Wellness, Chiropractor",2020-10-01,2022-06-01,2020-09-01,-2,11,2020-06-01
43578,1767,9,4,0,0,0,0,3,1,5,...,3622,1,2020-11-01,"Health & Wellness, Chiropractor",2020-10-01,2022-06-01,2020-09-01,0,7,2021-06-01
23317,1767,14,2,0,0,0,2,0,1,1,...,390,7,2018-09-01,"Health & Wellness, Chiropractor",2020-10-01,2022-06-01,2020-09-01,-3,9,2019-06-01
34655,1767,14,4,0,0,0,0,0,3,2,...,67,3,2019-11-01,"Health & Wellness, Chiropractor",2020-10-01,2022-06-01,2020-09-01,-1,7,2020-06-01


In [61]:
# User 1767: final five rows recorded under chapter 14.
df_master_clean[
    df_master_clean["user_ID"].eq(1767) & df_master_clean["chapter_ID"].eq(14)
].tail(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
39248,1767,14,3,1,0,0,0,1,1,2,...,0,1,2020-05-01,"Health & Wellness, Chiropractor",2020-10-01,2022-06-01,2020-09-01,-1,1,2020-06-01
40003,1767,14,5,0,0,0,0,0,2,1,...,305,4,2020-06-01,"Health & Wellness, Chiropractor",2020-10-01,2022-06-01,2020-09-01,-1,12,2021-06-01
40753,1767,14,4,0,0,0,0,2,1,1,...,630,2,2020-07-01,"Health & Wellness, Chiropractor",2020-10-01,2022-06-01,2020-09-01,-1,11,2021-06-01
41506,1767,14,4,0,0,0,0,0,0,0,...,505,0,2020-08-01,"Health & Wellness, Chiropractor",2020-10-01,2022-06-01,2020-09-01,-1,10,2021-06-01
42250,1767,14,4,0,0,0,0,0,0,0,...,116,0,2020-09-01,"Health & Wellness, Chiropractor",2020-10-01,2022-06-01,2020-09-01,-1,9,2021-06-01


In [62]:
# User 1767: first five rows recorded under chapter 9.
df_master_clean[
    df_master_clean["user_ID"].eq(1767) & df_master_clean["chapter_ID"].eq(9)
].head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
42847,1767,9,5,0,0,0,0,3,1,4,...,0,1,2020-10-01,"Health & Wellness, Chiropractor",2020-10-01,2022-06-01,2020-09-01,0,8,2021-06-01
43578,1767,9,4,0,0,0,0,3,1,5,...,3622,1,2020-11-01,"Health & Wellness, Chiropractor",2020-10-01,2022-06-01,2020-09-01,0,7,2021-06-01
44296,1767,9,5,0,0,0,0,0,3,3,...,6343,1,2020-12-01,"Health & Wellness, Chiropractor",2020-10-01,2022-06-01,2020-09-01,0,6,2021-06-01
45030,1767,9,4,0,0,0,0,2,0,4,...,1115,2,2021-01-01,"Health & Wellness, Chiropractor",2020-10-01,2022-06-01,2020-09-01,0,5,2021-06-01
45804,1767,9,4,0,0,0,0,3,0,2,...,1205,2,2021-02-01,"Health & Wellness, Chiropractor",2020-10-01,2022-06-01,2020-09-01,0,4,2021-06-01


In [63]:
# Distinct join dates on file for user 1767 in the membership database.
df_database[df_database["user_ID"].eq(1767)][["join_date"]].drop_duplicates()

Unnamed: 0,join_date
1599,2020-10-01
2701,2018-05-01


In [64]:
# Use the earliest join date found in df_database for all of user 1767's PALMS rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(1767), "join_date"
] = pd.Timestamp("2018-05-01")

In [65]:
# Most recent PALMS month in which user 1767 appears.
df_master_clean["palms_date"][df_master_clean["user_ID"].eq(1767)].max()

Timestamp('2021-02-01 00:00:00')

In [66]:
# PALMS records continue after the recorded drop_date, so treat user 1767 as
# never having dropped: blank out drop_date on all their rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(1767), "drop_date"
] = pd.NaT

#### - User 965 

In [67]:
# User 965: reproducible random sample of eight PALMS rows.
df_master_clean[df_master_clean["user_ID"].eq(965)].sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
41700,965,22,3,1,0,0,0,1,1,3,...,0,2,2020-08-01,"Finance & Insurance, Life,Health and Disabilit...",2019-11-01,2021-08-01,2019-11-01,0,12,2021-08-01
40205,965,22,4,0,0,0,0,1,0,0,...,263,5,2020-06-01,"Finance & Insurance, Life,Health and Disabilit...",2019-11-01,2021-08-01,2019-11-01,0,2,2020-08-01
32533,965,20,3,0,1,0,0,1,1,0,...,811,2,2019-08-01,"Finance & Insurance, Life,Health and Disabilit...",2019-11-01,2021-08-01,2019-11-01,-1,12,2020-08-01
34854,965,22,3,0,0,0,0,1,0,0,...,111,3,2019-11-01,"Finance & Insurance, Life,Health and Disabilit...",2019-11-01,2021-08-01,2019-11-01,0,9,2020-08-01
24299,965,20,2,1,0,0,1,1,2,1,...,932,1,2018-10-01,"Finance & Insurance, Life,Health and Disabilit...",2019-11-01,2021-08-01,2019-11-01,-2,10,2019-08-01
35581,965,22,3,0,0,0,0,0,1,0,...,355,5,2019-12-01,"Finance & Insurance, Life,Health and Disabilit...",2019-11-01,2021-08-01,2019-11-01,0,8,2020-08-01
30890,965,20,4,0,0,0,0,2,4,1,...,1260,9,2019-06-01,"Finance & Insurance, Life,Health and Disabilit...",2019-11-01,2021-08-01,2019-11-01,-1,2,2019-08-01
19546,965,20,3,0,1,0,0,0,0,3,...,1000,2,2018-04-01,"Finance & Insurance, Life,Health and Disabilit...",2019-11-01,2021-08-01,2019-11-01,-2,4,2018-08-01


In [68]:
# User 965: final five rows recorded under chapter 20.
df_master_clean[
    df_master_clean["user_ID"].eq(965) & df_master_clean["chapter_ID"].eq(20)
].tail(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
31736,965,20,4,0,0,0,1,2,1,0,...,90,2,2019-07-01,"Finance & Insurance, Life,Health and Disabilit...",2019-11-01,2021-08-01,2019-11-01,-1,1,2019-08-01
32533,965,20,3,0,1,0,0,1,1,0,...,811,2,2019-08-01,"Finance & Insurance, Life,Health and Disabilit...",2019-11-01,2021-08-01,2019-11-01,-1,12,2020-08-01
33306,965,20,0,0,2,0,2,1,1,0,...,113,2,2019-09-01,"Finance & Insurance, Life,Health and Disabilit...",2019-11-01,2021-08-01,2019-11-01,-1,11,2020-08-01
34077,965,20,4,0,0,0,1,1,2,0,...,167,4,2019-10-01,"Finance & Insurance, Life,Health and Disabilit...",2019-11-01,2021-08-01,2019-11-01,-1,10,2020-08-01
34807,965,20,0,1,0,0,0,0,1,1,...,0,0,2019-11-01,"Finance & Insurance, Life,Health and Disabilit...",2019-11-01,2021-08-01,2019-11-01,0,9,2020-08-01


In [69]:
# User 965: first five rows recorded under chapter 22.
df_master_clean[
    df_master_clean["user_ID"].eq(965) & df_master_clean["chapter_ID"].eq(22)
].head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
34854,965,22,3,0,0,0,0,1,0,0,...,111,3,2019-11-01,"Finance & Insurance, Life,Health and Disabilit...",2019-11-01,2021-08-01,2019-11-01,0,9,2020-08-01
35581,965,22,3,0,0,0,0,0,1,0,...,355,5,2019-12-01,"Finance & Insurance, Life,Health and Disabilit...",2019-11-01,2021-08-01,2019-11-01,0,8,2020-08-01
36319,965,22,4,0,0,0,0,1,3,0,...,0,3,2020-01-01,"Finance & Insurance, Life,Health and Disabilit...",2019-11-01,2021-08-01,2019-11-01,0,7,2020-08-01
37075,965,22,4,0,0,0,0,0,2,2,...,184,2,2020-02-01,"Finance & Insurance, Life,Health and Disabilit...",2019-11-01,2021-08-01,2019-11-01,0,6,2020-08-01
37890,965,22,3,0,0,0,0,1,3,1,...,142,2,2020-03-01,"Finance & Insurance, Life,Health and Disabilit...",2019-11-01,2021-08-01,2019-11-01,0,5,2020-08-01


In [70]:
# Transfer month 2019-11-01 for user 965: row 34807 is the chapter 20 record and
# row 34854 the chapter 22 record. Fold the old chapter's values into the new
# chapter's row, then remove the old row so the month appears only once.
palms_cols = df_master_clean.columns[2:15]  # presumably the PALMS counters P ... CEU -- TODO confirm
old_row, new_row = 34807, 34854

# Merge only while the old row still exists: re-running the cell is then a no-op
# rather than a KeyError, and the values can never be added twice.
if old_row in df_master_clean.index:
    df_master_clean.loc[[new_row], palms_cols] = (
        df_master_clean.loc[new_row, palms_cols].values
        + df_master_clean.loc[old_row, palms_cols].values
    )
    df_master_clean.drop(index=old_row, inplace=True)

In [71]:
# Distinct join dates on file for user 965 in the membership database.
df_database[df_database["user_ID"].eq(965)][["join_date"]].drop_duplicates()

Unnamed: 0,join_date
196,2019-11-01
2947,2017-08-01


In [72]:
# Use the earliest join date found in df_database for all of user 965's PALMS rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(965), "join_date"
] = pd.Timestamp("2017-08-01")

In [73]:
# Most recent PALMS month in which user 965 appears.
df_master_clean["palms_date"][df_master_clean["user_ID"].eq(965)].max()

Timestamp('2021-02-01 00:00:00')

In [74]:
# PALMS records continue after the recorded drop_date, so treat user 965 as
# never having dropped: blank out drop_date on all their rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(965), "drop_date"
] = pd.NaT

#### - User 35 

In [75]:
# User 35: reproducible random sample of eight PALMS rows.
df_master_clean[df_master_clean["user_ID"].eq(35)].sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
33400,35,23,4,0,0,0,0,0,0,4,...,0,5,2019-09-01,"Food & Beverage, Cater",2020-06-01,2021-06-01,2020-06-09,-1,9,2020-06-01
39483,35,23,4,0,0,0,0,2,1,8,...,1538,0,2020-05-01,"Food & Beverage, Cater",2020-06-01,2021-06-01,2020-06-09,-1,1,2020-06-01
36358,35,23,3,1,0,0,0,1,1,5,...,0,3,2020-01-01,"Food & Beverage, Cater",2020-06-01,2021-06-01,2020-06-09,-1,5,2020-06-01
31817,35,23,2,1,0,0,1,2,0,12,...,0,2,2019-07-01,"Food & Beverage, Cater",2020-06-01,2021-06-01,2020-06-09,-1,11,2020-06-01
32615,35,23,5,0,0,0,0,2,0,6,...,332,3,2019-08-01,"Food & Beverage, Cater",2020-06-01,2021-06-01,2020-06-09,-1,10,2020-06-01
42292,35,17,2,0,0,3,0,1,1,0,...,0,1,2020-09-01,"Food & Beverage, Cater",2020-06-01,2021-06-01,2020-06-09,0,9,2021-06-01
23570,35,23,3,0,0,0,1,2,0,0,...,458,9,2018-09-01,"Food & Beverage, Cater",2020-06-01,2021-06-01,2020-06-09,-2,9,2019-06-01
26781,35,23,4,0,0,0,0,1,2,1,...,16,3,2019-01-01,"Food & Beverage, Cater",2020-06-01,2021-06-01,2020-06-09,-2,5,2019-06-01


In [76]:
# User 35: final five rows recorded under chapter 23.
df_master_clean[
    df_master_clean["user_ID"].eq(35) & df_master_clean["chapter_ID"].eq(23)
].tail(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
37114,35,23,4,0,0,0,0,3,0,9,...,0,5,2020-02-01,"Food & Beverage, Cater",2020-06-01,2021-06-01,2020-06-09,-1,4,2020-06-01
37917,35,23,2,1,0,0,0,0,0,3,...,0,0,2020-03-01,"Food & Beverage, Cater",2020-06-01,2021-06-01,2020-06-09,-1,3,2020-06-01
38703,35,23,3,1,1,0,0,0,0,6,...,209,0,2020-04-01,"Food & Beverage, Cater",2020-06-01,2021-06-01,2020-06-09,-1,2,2020-06-01
39483,35,23,4,0,0,0,0,2,1,8,...,1538,0,2020-05-01,"Food & Beverage, Cater",2020-06-01,2021-06-01,2020-06-09,-1,1,2020-06-01
40231,35,23,1,1,0,0,0,0,0,0,...,219,0,2020-06-01,"Food & Beverage, Cater",2020-06-01,2021-06-01,2020-06-09,0,12,2021-06-01


In [77]:
# User 35: first five rows recorded under chapter 17.
df_master_clean[
    df_master_clean["user_ID"].eq(35) & df_master_clean["chapter_ID"].eq(17)
].head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
40051,35,17,2,0,0,1,0,1,0,1,...,0,0,2020-06-01,"Food & Beverage, Cater",2020-06-01,2021-06-01,2020-06-09,0,12,2021-06-01
40802,35,17,3,1,0,0,0,0,2,5,...,40,0,2020-07-01,"Food & Beverage, Cater",2020-06-01,2021-06-01,2020-06-09,0,11,2021-06-01
41550,35,17,2,2,0,0,0,0,1,4,...,0,1,2020-08-01,"Food & Beverage, Cater",2020-06-01,2021-06-01,2020-06-09,0,10,2021-06-01
42292,35,17,2,0,0,3,0,1,1,0,...,0,1,2020-09-01,"Food & Beverage, Cater",2020-06-01,2021-06-01,2020-06-09,0,9,2021-06-01
43030,35,17,3,1,0,0,0,2,1,1,...,2453,1,2020-10-01,"Food & Beverage, Cater",2020-06-01,2021-06-01,2020-06-09,0,8,2021-06-01


In [78]:
# Transfer month 2020-06-01 for user 35: row 40231 is the chapter 23 record and
# row 40051 the chapter 17 record. Fold the old chapter's values into the new
# chapter's row, then remove the old row so the month appears only once.
palms_cols = df_master_clean.columns[2:15]  # presumably the PALMS counters P ... CEU -- TODO confirm
old_row, new_row = 40231, 40051

# Merge only while the old row still exists: re-running the cell is then a no-op
# rather than a KeyError, and the values can never be added twice.
if old_row in df_master_clean.index:
    df_master_clean.loc[[new_row], palms_cols] = (
        df_master_clean.loc[new_row, palms_cols].values
        + df_master_clean.loc[old_row, palms_cols].values
    )
    df_master_clean.drop(index=old_row, inplace=True)

In [79]:
# Distinct join dates on file for user 35 in the membership database.
df_database[df_database["user_ID"].eq(35)][["join_date"]].drop_duplicates()

Unnamed: 0,join_date
2394,2020-06-01
2909,2018-06-01


In [80]:
# Use the earliest join date found in df_database for all of user 35's PALMS rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(35), "join_date"
] = pd.Timestamp("2018-06-01")

In [81]:
# Most recent PALMS month in which user 35 appears.
df_master_clean["palms_date"][df_master_clean["user_ID"].eq(35)].max()

Timestamp('2021-02-01 00:00:00')

In [82]:
# PALMS records continue after the recorded drop_date, so treat user 35 as
# never having dropped: blank out drop_date on all their rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(35), "drop_date"
] = pd.NaT

#### - User 2411 

In [83]:
# User 2411: reproducible random sample of eight PALMS rows.
df_master_clean[df_master_clean["user_ID"].eq(2411)].sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
31747,2411,20,5,0,0,0,0,1,2,0,...,0,6,2019-07-01,"Architecture & Engineering, Interior Architecture",2020-02-01,2022-02-01,2020-01-01,-1,7,2020-02-01
40213,2411,22,3,1,0,0,0,0,0,0,...,0,4,2020-06-01,"Architecture & Engineering, Interior Architecture",2020-02-01,2022-02-01,2020-01-01,0,8,2021-02-01
36282,2411,20,2,0,0,0,0,0,0,0,...,784,0,2020-01-01,"Architecture & Engineering, Interior Architecture",2020-02-01,2022-02-01,2020-01-01,-1,1,2020-02-01
33318,2411,20,2,0,1,0,1,0,0,0,...,0,6,2019-09-01,"Architecture & Engineering, Interior Architecture",2020-02-01,2022-02-01,2020-01-01,-1,5,2020-02-01
34820,2411,20,2,1,0,0,1,1,1,0,...,2820,2,2019-11-01,"Architecture & Engineering, Interior Architecture",2020-02-01,2022-02-01,2020-01-01,-1,3,2020-02-01
40960,2411,22,4,1,0,0,0,0,1,0,...,0,5,2020-07-01,"Architecture & Engineering, Interior Architecture",2020-02-01,2022-02-01,2020-01-01,0,7,2021-02-01
39465,2411,22,3,0,0,1,0,0,0,1,...,0,4,2020-05-01,"Architecture & Engineering, Interior Architecture",2020-02-01,2022-02-01,2020-01-01,0,9,2021-02-01
23517,2411,20,3,1,0,0,0,1,2,0,...,0,4,2018-09-01,"Architecture & Engineering, Interior Architecture",2020-02-01,2022-02-01,2020-01-01,-2,5,2019-02-01


In [84]:
# User 2411: final five rows recorded under chapter 20.
df_master_clean[
    df_master_clean["user_ID"].eq(2411) & df_master_clean["chapter_ID"].eq(20)
].tail(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
33318,2411,20,2,0,1,0,1,0,0,0,...,0,6,2019-09-01,"Architecture & Engineering, Interior Architecture",2020-02-01,2022-02-01,2020-01-01,-1,5,2020-02-01
34090,2411,20,3,1,0,0,1,1,3,0,...,0,5,2019-10-01,"Architecture & Engineering, Interior Architecture",2020-02-01,2022-02-01,2020-01-01,-1,4,2020-02-01
34820,2411,20,2,1,0,0,1,1,1,0,...,2820,2,2019-11-01,"Architecture & Engineering, Interior Architecture",2020-02-01,2022-02-01,2020-01-01,-1,3,2020-02-01
35546,2411,20,3,0,0,0,0,3,1,0,...,1544,6,2019-12-01,"Architecture & Engineering, Interior Architecture",2020-02-01,2022-02-01,2020-01-01,-1,2,2020-02-01
36282,2411,20,2,0,0,0,0,0,0,0,...,784,0,2020-01-01,"Architecture & Engineering, Interior Architecture",2020-02-01,2022-02-01,2020-01-01,-1,1,2020-02-01


In [85]:
# User 2411: first five rows recorded under chapter 22.
df_master_clean[
    df_master_clean["user_ID"].eq(2411) & df_master_clean["chapter_ID"].eq(22)
].head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
37095,2411,22,3,0,0,0,0,0,0,0,...,156,0,2020-02-01,"Architecture & Engineering, Interior Architecture",2020-02-01,2022-02-01,2020-01-01,0,12,2021-02-01
37898,2411,22,3,0,0,0,0,1,1,1,...,208,5,2020-03-01,"Architecture & Engineering, Interior Architecture",2020-02-01,2022-02-01,2020-01-01,0,11,2021-02-01
38685,2411,22,4,1,0,0,0,0,1,1,...,0,6,2020-04-01,"Architecture & Engineering, Interior Architecture",2020-02-01,2022-02-01,2020-01-01,0,10,2021-02-01
39465,2411,22,3,0,0,1,0,0,0,1,...,0,4,2020-05-01,"Architecture & Engineering, Interior Architecture",2020-02-01,2022-02-01,2020-01-01,0,9,2021-02-01
40213,2411,22,3,1,0,0,0,0,0,0,...,0,4,2020-06-01,"Architecture & Engineering, Interior Architecture",2020-02-01,2022-02-01,2020-01-01,0,8,2021-02-01


In [86]:
# Distinct join dates on file for user 2411 in the membership database.
df_database[df_database["user_ID"].eq(2411)][["join_date"]].drop_duplicates()

Unnamed: 0,join_date
202,2020-02-01
2980,2018-01-01


In [87]:
# Use the earliest join date found in df_database for all of user 2411's PALMS rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(2411), "join_date"
] = pd.Timestamp("2018-01-01")

In [88]:
# Most recent PALMS month in which user 2411 appears.
df_master_clean["palms_date"][df_master_clean["user_ID"].eq(2411)].max()

Timestamp('2021-02-01 00:00:00')

In [89]:
# PALMS records continue after the recorded drop_date, so treat user 2411 as
# never having dropped: blank out drop_date on all their rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(2411), "drop_date"
] = pd.NaT

#### - User 1686 

In [90]:
# User 1686: reproducible random sample of eight PALMS rows.
df_master_clean[df_master_clean["user_ID"].eq(1686)].sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
28187,1686,13,2,0,1,0,1,1,4,0,...,511,6,2019-03-01,"Legal & Accounting, Accounting Services",2019-11-01,2021-03-01,2019-09-19,-1,12,2020-03-01
43285,1686,25,5,0,0,0,0,1,2,2,...,3006,3,2020-10-01,"Legal & Accounting, Accounting Services",2019-11-01,2021-03-01,2019-09-19,0,5,2021-03-01
36463,1686,25,4,0,0,0,0,3,1,0,...,2326,3,2020-01-01,"Legal & Accounting, Accounting Services",2019-11-01,2021-03-01,2019-09-19,0,2,2020-03-01
40317,1686,25,4,0,0,0,0,1,3,0,...,792,0,2020-06-01,"Legal & Accounting, Accounting Services",2019-11-01,2021-03-01,2019-09-19,0,9,2021-03-01
33118,1686,13,4,0,0,0,0,0,5,0,...,450,12,2019-09-01,"Legal & Accounting, Accounting Services",2019-11-01,2021-03-01,2019-09-19,-1,6,2020-03-01
31528,1686,13,3,0,0,2,0,1,3,0,...,1190,2,2019-07-01,"Legal & Accounting, Accounting Services",2019-11-01,2021-03-01,2019-09-19,-1,8,2020-03-01
41808,1686,25,3,0,0,0,1,2,1,1,...,0,1,2020-08-01,"Legal & Accounting, Accounting Services",2019-11-01,2021-03-01,2019-09-19,0,7,2021-03-01
23270,1686,13,4,0,0,0,0,1,3,0,...,2145,19,2018-09-01,"Legal & Accounting, Accounting Services",2019-11-01,2021-03-01,2019-09-19,-2,6,2019-03-01


In [91]:
# User 1686: final five rows recorded under chapter 13.
df_master_clean[
    df_master_clean["user_ID"].eq(1686) & df_master_clean["chapter_ID"].eq(13)
].tail(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
29867,1686,13,3,0,0,0,2,2,3,2,...,583,1,2019-05-01,"Legal & Accounting, Accounting Services",2019-11-01,2021-03-01,2019-09-19,-1,10,2020-03-01
30691,1686,13,4,0,0,0,0,0,2,2,...,1267,2,2019-06-01,"Legal & Accounting, Accounting Services",2019-11-01,2021-03-01,2019-09-19,-1,9,2020-03-01
31528,1686,13,3,0,0,2,0,1,3,0,...,1190,2,2019-07-01,"Legal & Accounting, Accounting Services",2019-11-01,2021-03-01,2019-09-19,-1,8,2020-03-01
32345,1686,13,4,0,0,0,0,3,1,0,...,0,17,2019-08-01,"Legal & Accounting, Accounting Services",2019-11-01,2021-03-01,2019-09-19,-1,7,2020-03-01
33118,1686,13,4,0,0,0,0,0,5,0,...,450,12,2019-09-01,"Legal & Accounting, Accounting Services",2019-11-01,2021-03-01,2019-09-19,-1,6,2020-03-01


In [92]:
# User 1686: first five rows recorded under chapter 25.
df_master_clean[
    df_master_clean["user_ID"].eq(1686) & df_master_clean["chapter_ID"].eq(25)
].head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
35006,1686,25,3,0,0,0,1,3,1,2,...,131,7,2019-11-01,"Legal & Accounting, Accounting Services",2019-11-01,2021-03-01,2019-09-19,0,4,2020-03-01
35727,1686,25,3,0,0,0,0,4,0,0,...,345,4,2019-12-01,"Legal & Accounting, Accounting Services",2019-11-01,2021-03-01,2019-09-19,0,3,2020-03-01
36463,1686,25,4,0,0,0,0,3,1,0,...,2326,3,2020-01-01,"Legal & Accounting, Accounting Services",2019-11-01,2021-03-01,2019-09-19,0,2,2020-03-01
37221,1686,25,4,0,0,0,0,1,5,1,...,453,1,2020-02-01,"Legal & Accounting, Accounting Services",2019-11-01,2021-03-01,2019-09-19,0,1,2020-03-01
38002,1686,25,3,0,0,0,0,1,3,1,...,609,3,2020-03-01,"Legal & Accounting, Accounting Services",2019-11-01,2021-03-01,2019-09-19,0,12,2021-03-01


In [93]:
# Distinct join dates on file for user 1686 in the membership database.
df_database[df_database["user_ID"].eq(1686)][["join_date"]].drop_duplicates()

Unnamed: 0,join_date
1178,2019-11-01
2865,2017-03-01
3047,2018-02-01


In [94]:
# Use the earliest of the three join dates found in df_database for all of
# user 1686's PALMS rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(1686), "join_date"
] = pd.Timestamp("2017-03-01")

In [95]:
# Most recent PALMS month in which user 1686 appears.
df_master_clean["palms_date"][df_master_clean["user_ID"].eq(1686)].max()

Timestamp('2021-02-01 00:00:00')

In [96]:
# PALMS records continue after the recorded drop_date, so treat user 1686 as
# never having dropped: blank out drop_date on all their rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(1686), "drop_date"
] = pd.NaT

#### - User 274 

In [97]:
# User 274: reproducible random sample of eight PALMS rows.
df_master_clean[df_master_clean["user_ID"].eq(274)].sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
31536,274,13,4,0,0,0,1,0,0,1,...,150,1,2019-07-01,"Finance & Insurance, Property & Casualty Insur...",2019-09-01,2021-03-01,2019-08-14,-1,8,2020-03-01
39868,274,9,3,0,1,0,0,2,2,3,...,6350,4,2020-06-01,"Finance & Insurance, Property & Casualty Insur...",2019-09-01,2021-03-01,2019-08-14,0,9,2021-03-01
35987,274,9,4,0,0,0,0,0,1,3,...,0,0,2020-01-01,"Finance & Insurance, Property & Casualty Insur...",2019-09-01,2021-03-01,2019-08-14,0,2,2020-03-01
33005,274,9,3,0,0,0,1,0,1,0,...,0,6,2019-09-01,"Finance & Insurance, Property & Casualty Insur...",2019-09-01,2021-03-01,2019-08-14,0,6,2020-03-01
34531,274,9,4,0,0,0,0,0,4,2,...,1103,5,2019-11-01,"Finance & Insurance, Property & Casualty Insur...",2019-09-01,2021-03-01,2019-08-14,0,4,2020-03-01
40614,274,9,2,1,1,0,1,2,1,2,...,2100,1,2020-07-01,"Finance & Insurance, Property & Casualty Insur...",2019-09-01,2021-03-01,2019-08-14,0,8,2021-03-01
39112,274,9,4,0,0,0,0,2,1,3,...,405,0,2020-05-01,"Finance & Insurance, Property & Casualty Insur...",2019-09-01,2021-03-01,2019-08-14,0,10,2021-03-01
23277,274,13,4,0,0,0,0,1,3,1,...,0,0,2018-09-01,"Finance & Insurance, Property & Casualty Insur...",2019-09-01,2021-03-01,2019-08-14,-1,6,2019-03-01


In [98]:
# User 274: final five rows recorded under chapter 13.
df_master_clean[
    df_master_clean["user_ID"].eq(274) & df_master_clean["chapter_ID"].eq(13)
].tail(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
29031,274,13,2,2,0,0,0,2,1,3,...,2420,0,2019-04-01,"Finance & Insurance, Property & Casualty Insur...",2019-09-01,2021-03-01,2019-08-14,-1,11,2020-03-01
29875,274,13,5,0,0,0,0,3,4,1,...,380,2,2019-05-01,"Finance & Insurance, Property & Casualty Insur...",2019-09-01,2021-03-01,2019-08-14,-1,10,2020-03-01
30699,274,13,3,0,0,0,1,1,1,1,...,3336,0,2019-06-01,"Finance & Insurance, Property & Casualty Insur...",2019-09-01,2021-03-01,2019-08-14,-1,9,2020-03-01
31536,274,13,4,0,0,0,1,0,0,1,...,150,1,2019-07-01,"Finance & Insurance, Property & Casualty Insur...",2019-09-01,2021-03-01,2019-08-14,-1,8,2020-03-01
32352,274,13,3,0,0,0,0,2,0,0,...,0,0,2019-08-01,"Finance & Insurance, Property & Casualty Insur...",2019-09-01,2021-03-01,2019-08-14,-1,7,2020-03-01


In [99]:
# First five df_master_clean rows for user 274 in chapter 9.
df_master_clean[
    df_master_clean["user_ID"].eq(274) & df_master_clean["chapter_ID"].eq(9)
].head()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
33005,274,9,3,0,0,0,1,0,1,0,...,0,6,2019-09-01,"Finance & Insurance, Property & Casualty Insur...",2019-09-01,2021-03-01,2019-08-14,0,6,2020-03-01
33785,274,9,4,0,0,0,1,0,3,5,...,0,2,2019-10-01,"Finance & Insurance, Property & Casualty Insur...",2019-09-01,2021-03-01,2019-08-14,0,5,2020-03-01
34531,274,9,4,0,0,0,0,0,4,2,...,1103,5,2019-11-01,"Finance & Insurance, Property & Casualty Insur...",2019-09-01,2021-03-01,2019-08-14,0,4,2020-03-01
35262,274,9,2,1,0,0,0,1,6,1,...,2253,1,2019-12-01,"Finance & Insurance, Property & Casualty Insur...",2019-09-01,2021-03-01,2019-08-14,0,3,2020-03-01
35987,274,9,4,0,0,0,0,0,1,3,...,0,0,2020-01-01,"Finance & Insurance, Property & Casualty Insur...",2019-09-01,2021-03-01,2019-08-14,0,2,2020-03-01


In [100]:
# Distinct join_date values that df_database holds for user 274.
df_database[df_database["user_ID"].eq(274)][["join_date"]].drop_duplicates()

Unnamed: 0,join_date
1584,2019-09-01
3050,2018-02-01


In [101]:
# Give every row of user 274 the join date 2018-02-01, the earlier of the two
# join dates df_database lists for this user.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(274), "join_date"
] = pd.Timestamp("2018-02-01")

In [102]:
# Latest palms_date on record for user 274.
df_master_clean[df_master_clean["user_ID"].eq(274)]["palms_date"].max()

Timestamp('2021-02-01 00:00:00')

In [103]:
# Blank out drop_date on all of user 274's rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(274), "drop_date"
] = pd.NaT

#### - User 2692 

In [104]:
# Reproducible sample (seed 23) of eight rows belonging to user 2692.
df_master_clean[df_master_clean["user_ID"].eq(2692)].sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
26839,2692,24,3,0,0,0,0,1,5,4,...,100,4,2019-01-01,"Marketing, Graphic Designer",2019-02-01,2020-02-01,2020-02-01,-1,1,2019-02-01
29366,2692,24,4,0,0,0,0,3,3,2,...,1962,14,2019-04-01,"Marketing, Graphic Designer",2019-02-01,2020-02-01,2020-02-01,0,10,2020-02-01
21148,2692,20,4,0,0,0,0,0,4,0,...,1393,4,2018-06-01,"Marketing, Graphic Designer",2019-02-01,2020-02-01,2020-02-01,-1,8,2019-02-01
25902,2692,20,3,0,0,0,0,0,3,0,...,727,3,2018-12-01,"Marketing, Graphic Designer",2019-02-01,2020-02-01,2020-02-01,-1,2,2019-02-01
31033,2692,24,2,0,2,0,0,2,2,0,...,1663,5,2019-06-01,"Marketing, Graphic Designer",2019-02-01,2020-02-01,2020-02-01,0,8,2020-02-01
16375,2692,20,1,0,1,0,0,1,1,0,...,150,1,2017-12-01,"Marketing, Graphic Designer",2019-02-01,2020-02-01,2020-02-01,-2,2,2018-02-01
33455,2692,24,3,0,1,0,0,0,5,0,...,786,7,2019-09-01,"Marketing, Graphic Designer",2019-02-01,2020-02-01,2020-02-01,0,5,2020-02-01
14005,2692,20,3,0,1,0,0,3,4,1,...,375,12,2017-09-01,"Marketing, Graphic Designer",2019-02-01,2020-02-01,2020-02-01,-2,5,2018-02-01


In [105]:
# Last five df_master_clean rows for user 2692 in chapter 20.
df_master_clean[
    df_master_clean["user_ID"].eq(2692) & df_master_clean["chapter_ID"].eq(20)
].tail()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
23508,2692,20,3,0,1,0,0,1,4,2,...,410,13,2018-09-01,"Marketing, Graphic Designer",2019-02-01,2020-02-01,2020-02-01,-1,5,2019-02-01
24309,2692,20,2,1,1,0,0,1,3,0,...,50,1,2018-10-01,"Marketing, Graphic Designer",2019-02-01,2020-02-01,2020-02-01,-1,4,2019-02-01
25086,2692,20,3,0,0,0,1,2,4,3,...,1645,8,2018-11-01,"Marketing, Graphic Designer",2019-02-01,2020-02-01,2020-02-01,-1,3,2019-02-01
25902,2692,20,3,0,0,0,0,0,3,0,...,727,3,2018-12-01,"Marketing, Graphic Designer",2019-02-01,2020-02-01,2020-02-01,-1,2,2019-02-01
26720,2692,20,2,0,0,0,0,2,2,4,...,996,4,2019-01-01,"Marketing, Graphic Designer",2019-02-01,2020-02-01,2020-02-01,-1,1,2019-02-01


In [106]:
# First five df_master_clean rows for user 2692 in chapter 24.
df_master_clean[
    df_master_clean["user_ID"].eq(2692) & df_master_clean["chapter_ID"].eq(24)
].head()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
26839,2692,24,3,0,0,0,0,1,5,4,...,100,4,2019-01-01,"Marketing, Graphic Designer",2019-02-01,2020-02-01,2020-02-01,-1,1,2019-02-01
27669,2692,24,3,1,0,0,0,2,3,2,...,2437,5,2019-02-01,"Marketing, Graphic Designer",2019-02-01,2020-02-01,2020-02-01,0,12,2020-02-01
28523,2692,24,2,0,0,0,2,1,1,3,...,435,4,2019-03-01,"Marketing, Graphic Designer",2019-02-01,2020-02-01,2020-02-01,0,11,2020-02-01
29366,2692,24,4,0,0,0,0,3,3,2,...,1962,14,2019-04-01,"Marketing, Graphic Designer",2019-02-01,2020-02-01,2020-02-01,0,10,2020-02-01
30207,2692,24,5,0,0,0,0,1,7,1,...,1909,17,2019-05-01,"Marketing, Graphic Designer",2019-02-01,2020-02-01,2020-02-01,0,9,2020-02-01


In [107]:
# User 2692 has two reports dated 2019-01-01: index 26720 (chapter 20) and
# index 26839 (chapter 24). Add the chapter-20 values onto the chapter-24 row
# and then remove the chapter-20 row.
# columns[2:15] is the 13-column slice that starts at "P"; the rendered frame
# elides its middle columns, so confirm the slice ends before palms_date.
# The membership guard keeps the cell re-runnable: once 26720 has been merged
# and dropped, running the cell again is a no-op instead of a KeyError.
if 26720 in df_master_clean.index:
    df_master_clean.loc[[26839], df_master_clean.columns[2:15]] = (
        df_master_clean.loc[26839, df_master_clean.columns[2:15]].values
        + df_master_clean.loc[26720, df_master_clean.columns[2:15]].values
    )
    df_master_clean.drop(26720, axis=0, inplace=True)

In [108]:
# Distinct join_date values that df_database holds for user 2692.
df_database[df_database["user_ID"].eq(2692)][["join_date"]].drop_duplicates()

Unnamed: 0,join_date
2635,2019-02-01
2971,2017-08-01


In [109]:
# Give every row of user 2692 the join date 2017-08-01, the earlier of the two
# join dates df_database lists for this user.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(2692), "join_date"
] = pd.Timestamp("2017-08-01")

In [110]:
# Latest palms_date on record for user 2692.
df_master_clean[df_master_clean["user_ID"].eq(2692)]["palms_date"].max()

Timestamp('2020-02-01 00:00:00')

In [111]:
# Overwrite drop_date for user 2692 with 2020-02-01, which equals the latest
# palms_date found for this user.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(2692), "drop_date"
] = pd.Timestamp("2020-02-01")

#### - User 889 

In [112]:
# Reproducible sample (seed 23) of eight rows belonging to user 889.
df_master_clean[df_master_clean["user_ID"].eq(889)].sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
30768,889,16,3,0,0,0,0,0,1,0,...,0,2,2019-06-01,"Accounting, Accountant",2019-05-01,2021-05-01,2020-08-07,0,11,2020-05-01
38491,889,16,5,0,0,0,0,1,2,0,...,150,6,2020-04-01,"Accounting, Accountant",2019-05-01,2021-05-01,2020-08-07,0,1,2020-05-01
23771,889,3,4,0,0,0,0,0,2,0,...,0,8,2018-10-01,"Accounting, Accountant",2019-05-01,2021-05-01,2020-08-07,-1,7,2019-05-01
33181,889,16,4,0,0,0,0,1,3,0,...,155,11,2019-09-01,"Accounting, Accountant",2019-05-01,2021-05-01,2020-08-07,0,8,2020-05-01
35408,889,16,2,0,1,0,1,1,1,1,...,30,3,2019-12-01,"Accounting, Accountant",2019-05-01,2021-05-01,2020-08-07,0,5,2020-05-01
15924,889,3,3,0,0,0,0,0,1,0,...,1250,19,2017-12-01,"Accounting, Accountant",2019-05-01,2021-05-01,2020-08-07,-2,5,2018-05-01
37705,889,16,3,0,0,0,0,0,1,0,...,750,2,2020-03-01,"Accounting, Accountant",2019-05-01,2021-05-01,2020-08-07,0,2,2020-05-01
13538,889,3,4,0,0,0,0,1,1,0,...,0,12,2017-09-01,"Accounting, Accountant",2019-05-01,2021-05-01,2020-08-07,-2,8,2018-05-01


In [113]:
# Last five df_master_clean rows for user 889 in chapter 3.
df_master_clean[
    df_master_clean["user_ID"].eq(889) & df_master_clean["chapter_ID"].eq(3)
].tail()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
22960,889,3,3,0,1,0,0,0,1,0,...,220,9,2018-09-01,"Accounting, Accountant",2019-05-01,2021-05-01,2020-08-07,-1,8,2019-05-01
23771,889,3,4,0,0,0,0,0,2,0,...,0,8,2018-10-01,"Accounting, Accountant",2019-05-01,2021-05-01,2020-08-07,-1,7,2019-05-01
24575,889,3,4,0,0,0,0,1,0,0,...,0,3,2018-11-01,"Accounting, Accountant",2019-05-01,2021-05-01,2020-08-07,-1,6,2019-05-01
25355,889,3,1,0,2,0,0,3,0,0,...,0,7,2018-12-01,"Accounting, Accountant",2019-05-01,2021-05-01,2020-08-07,-1,5,2019-05-01
26167,889,3,2,1,0,0,0,0,1,1,...,1405,0,2019-01-01,"Accounting, Accountant",2019-05-01,2021-05-01,2020-08-07,-1,4,2019-05-01


In [114]:
# First five df_master_clean rows for user 889 in chapter 16.
df_master_clean[
    df_master_clean["user_ID"].eq(889) & df_master_clean["chapter_ID"].eq(16)
].head()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
30768,889,16,3,0,0,0,0,0,1,0,...,0,2,2019-06-01,"Accounting, Accountant",2019-05-01,2021-05-01,2020-08-07,0,11,2020-05-01
31611,889,16,4,1,0,0,0,1,4,0,...,3434,5,2019-07-01,"Accounting, Accountant",2019-05-01,2021-05-01,2020-08-07,0,10,2020-05-01
32409,889,16,3,1,0,0,0,0,2,0,...,0,7,2019-08-01,"Accounting, Accountant",2019-05-01,2021-05-01,2020-08-07,0,9,2020-05-01
33181,889,16,4,0,0,0,0,1,3,0,...,155,11,2019-09-01,"Accounting, Accountant",2019-05-01,2021-05-01,2020-08-07,0,8,2020-05-01
33957,889,16,5,0,0,0,0,0,2,0,...,0,11,2019-10-01,"Accounting, Accountant",2019-05-01,2021-05-01,2020-08-07,0,7,2020-05-01


In [115]:
# Distinct join_date values that df_database holds for user 889.
df_database[df_database["user_ID"].eq(889)][["join_date"]].drop_duplicates()

Unnamed: 0,join_date
2076,2019-05-01
2805,2017-08-01


In [116]:
# Give every row of user 889 the join date 2017-08-01, the earlier of the two
# join dates df_database lists for this user.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(889), "join_date"
] = pd.Timestamp("2017-08-01")

In [117]:
# Latest palms_date on record for user 889.
df_master_clean[df_master_clean["user_ID"].eq(889)]["palms_date"].max()

Timestamp('2020-08-01 00:00:00')

In [118]:
# Overwrite drop_date for user 889 with 2020-08-01, which equals the latest
# palms_date found for this user.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(889), "drop_date"
] = pd.Timestamp("2020-08-01")

#### - User 1997 

In [119]:
# Reproducible sample (seed 23) of eight rows belonging to user 1997.
df_master_clean[df_master_clean["user_ID"].eq(1997)].sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
32154,1997,8,3,0,0,0,0,0,13,0,...,1144,0,2019-08-01,"Coach, Business Coach",2019-07-01,2020-11-01,2020-07-02,0,3,2019-11-01
34477,1997,8,4,0,0,0,0,0,6,1,...,234,5,2019-11-01,"Coach, Business Coach",2019-07-01,2020-11-01,2020-07-02,0,12,2020-11-01
7483,1997,9,2,0,0,0,2,0,0,0,...,0,0,2017-01-01,"Coach, Business Coach",2019-07-01,2020-11-01,2020-07-02,-3,10,2017-11-01
12047,1997,9,4,0,0,0,0,0,2,0,...,0,1,2017-07-01,"Coach, Business Coach",2019-07-01,2020-11-01,2020-07-02,-2,4,2017-11-01
35926,1997,8,4,0,0,0,0,0,1,1,...,468,0,2020-01-01,"Coach, Business Coach",2019-07-01,2020-11-01,2020-07-02,0,10,2020-11-01
3033,1997,9,3,0,0,0,0,0,1,0,...,56,1,2016-07-01,"Coach, Business Coach",2019-07-01,2020-11-01,2020-07-02,-3,4,2016-11-01
38265,1997,8,5,0,0,0,0,0,2,4,...,540,0,2020-04-01,"Coach, Business Coach",2019-07-01,2020-11-01,2020-07-02,0,7,2020-11-01
882,1997,9,4,0,0,0,0,0,0,0,...,385,4,2016-04-01,"Coach, Business Coach",2019-07-01,2020-11-01,2020-07-02,-4,7,2016-11-01


In [120]:
# Last five df_master_clean rows for user 1997 in chapter 9.
df_master_clean[
    df_master_clean["user_ID"].eq(1997) & df_master_clean["chapter_ID"].eq(9)
].tail()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
9679,1997,9,4,0,0,0,0,0,1,0,...,0,1,2017-04-01,"Coach, Business Coach",2019-07-01,2020-11-01,2020-07-02,-3,7,2017-11-01
10402,1997,9,4,0,0,0,0,0,1,0,...,0,1,2017-05-01,"Coach, Business Coach",2019-07-01,2020-11-01,2020-07-02,-3,6,2017-11-01
11223,1997,9,4,0,0,0,1,0,28,0,...,0,0,2017-06-01,"Coach, Business Coach",2019-07-01,2020-11-01,2020-07-02,-3,5,2017-11-01
12047,1997,9,4,0,0,0,0,0,2,0,...,0,1,2017-07-01,"Coach, Business Coach",2019-07-01,2020-11-01,2020-07-02,-2,4,2017-11-01
12851,1997,9,1,1,0,0,0,0,1,4,...,0,0,2017-08-01,"Coach, Business Coach",2019-07-01,2020-11-01,2020-07-02,-2,3,2017-11-01


In [121]:
# First five df_master_clean rows for user 1997 in chapter 8.
df_master_clean[
    df_master_clean["user_ID"].eq(1997) & df_master_clean["chapter_ID"].eq(8)
].head()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
32154,1997,8,3,0,0,0,0,0,13,0,...,1144,0,2019-08-01,"Coach, Business Coach",2019-07-01,2020-11-01,2020-07-02,0,3,2019-11-01
32938,1997,8,4,0,0,0,0,0,10,3,...,729,0,2019-09-01,"Coach, Business Coach",2019-07-01,2020-11-01,2020-07-02,0,2,2019-11-01
33719,1997,8,5,0,0,0,0,1,5,0,...,1129,0,2019-10-01,"Coach, Business Coach",2019-07-01,2020-11-01,2020-07-02,0,1,2019-11-01
34477,1997,8,4,0,0,0,0,0,6,1,...,234,5,2019-11-01,"Coach, Business Coach",2019-07-01,2020-11-01,2020-07-02,0,12,2020-11-01
35205,1997,8,4,0,0,0,0,0,0,4,...,1507,0,2019-12-01,"Coach, Business Coach",2019-07-01,2020-11-01,2020-07-02,0,11,2020-11-01


This is a case where a member quit and rejoined about two years later: the chapter 9 reports end in 2017-08 and the chapter 8 reports start in 2019-08.

In [122]:
# Distinct join_date values that df_database holds for user 1997.
df_database[df_database["user_ID"].eq(1997)][["join_date"]].drop_duplicates()

Unnamed: 0,join_date
441,2019-07-01
1451,2011-02-01


In [123]:
# Give every row of user 1997 the join date 2011-02-01, the earlier of the two
# join dates df_database lists for this user.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(1997), "join_date"
] = pd.Timestamp("2011-02-01")

In [124]:
# Latest palms_date on record for user 1997.
df_master_clean[df_master_clean["user_ID"].eq(1997)]["palms_date"].max()

Timestamp('2020-09-01 00:00:00')

In [125]:
# Overwrite drop_date for user 1997 with 2020-09-01, which equals the latest
# palms_date found for this user.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(1997), "drop_date"
] = pd.Timestamp("2020-09-01")

#### - User 540 

In [126]:
# Reproducible sample (seed 23) of eight rows belonging to user 540.
df_master_clean[df_master_clean["user_ID"].eq(540)].sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
2901,540,3,3,0,0,0,1,1,0,0,...,39,0,2016-07-01,"Legal, Lawyer Real Estate",2017-07-01,2021-02-01,NaT,-1,7,2017-02-01
5840,540,3,5,0,0,0,0,0,3,0,...,3367,2,2016-11-01,"Legal, Lawyer Real Estate",2017-07-01,2021-02-01,NaT,-1,3,2017-02-01
43256,540,25,5,0,0,0,0,3,4,0,...,2938,7,2020-10-01,"Legal, Lawyer Real Estate",2017-07-01,2021-02-01,NaT,3,4,2021-02-01
11890,540,3,2,0,0,0,0,0,1,0,...,0,0,2017-07-01,"Legal, Lawyer Real Estate",2017-07-01,2021-02-01,NaT,0,7,2018-02-01
43978,540,25,4,0,0,0,0,1,4,0,...,11459,12,2020-11-01,"Legal, Lawyer Real Estate",2017-07-01,2021-02-01,NaT,3,3,2021-02-01
27711,540,25,3,0,0,0,1,1,4,0,...,15856,3,2019-02-01,"Legal, Lawyer Real Estate",2017-07-01,2021-02-01,NaT,1,12,2020-02-01
22047,540,25,2,1,0,0,1,2,1,0,...,818,2,2018-07-01,"Legal, Lawyer Real Estate",2017-07-01,2021-02-01,NaT,1,7,2019-02-01
13361,540,25,5,0,0,0,0,1,5,0,...,2491,4,2017-08-01,"Legal, Lawyer Real Estate",2017-07-01,2021-02-01,NaT,0,6,2018-02-01


In [127]:
# Last five df_master_clean rows for user 540 in chapter 3.
df_master_clean[
    df_master_clean["user_ID"].eq(540) & df_master_clean["chapter_ID"].eq(3)
].tail()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
8783,540,3,4,0,0,0,0,2,5,0,...,350,3,2017-03-01,"Legal, Lawyer Real Estate",2017-07-01,2021-02-01,NaT,-1,11,2018-02-01
9522,540,3,4,0,0,0,0,0,6,0,...,400,2,2017-04-01,"Legal, Lawyer Real Estate",2017-07-01,2021-02-01,NaT,-1,10,2018-02-01
10247,540,3,3,1,1,0,0,0,6,0,...,0,7,2017-05-01,"Legal, Lawyer Real Estate",2017-07-01,2021-02-01,NaT,-1,9,2018-02-01
11068,540,3,1,3,0,0,0,1,4,1,...,0,0,2017-06-01,"Legal, Lawyer Real Estate",2017-07-01,2021-02-01,NaT,-1,8,2018-02-01
11890,540,3,2,0,0,0,0,0,1,0,...,0,0,2017-07-01,"Legal, Lawyer Real Estate",2017-07-01,2021-02-01,NaT,0,7,2018-02-01


In [128]:
# First five df_master_clean rows for user 540 in chapter 25.
df_master_clean[
    df_master_clean["user_ID"].eq(540) & df_master_clean["chapter_ID"].eq(25)
].head()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
12516,540,25,2,1,0,0,1,1,0,0,...,0,0,2017-07-01,"Legal, Lawyer Real Estate",2017-07-01,2021-02-01,NaT,0,7,2018-02-01
13361,540,25,5,0,0,0,0,1,5,0,...,2491,4,2017-08-01,"Legal, Lawyer Real Estate",2017-07-01,2021-02-01,NaT,0,6,2018-02-01
14177,540,25,3,0,0,0,1,1,0,0,...,1748,0,2017-09-01,"Legal, Lawyer Real Estate",2017-07-01,2021-02-01,NaT,0,5,2018-02-01
14991,540,25,3,0,0,0,1,1,3,0,...,2250,4,2017-10-01,"Legal, Lawyer Real Estate",2017-07-01,2021-02-01,NaT,0,4,2018-02-01
15770,540,25,5,0,0,0,0,3,6,0,...,245,3,2017-11-01,"Legal, Lawyer Real Estate",2017-07-01,2021-02-01,NaT,0,3,2018-02-01


In [129]:
# User 540 has two reports dated 2017-07-01: index 11890 (chapter 3) and
# index 12516 (chapter 25). Add the chapter-3 values onto the chapter-25 row
# and then remove the chapter-3 row.
# columns[2:15] is the 13-column slice that starts at "P"; the rendered frame
# elides its middle columns, so confirm the slice ends before palms_date.
# The membership guard keeps the cell re-runnable: once 11890 has been merged
# and dropped, running the cell again is a no-op instead of a KeyError.
if 11890 in df_master_clean.index:
    df_master_clean.loc[[12516], df_master_clean.columns[2:15]] = (
        df_master_clean.loc[12516, df_master_clean.columns[2:15]].values
        + df_master_clean.loc[11890, df_master_clean.columns[2:15]].values
    )
    df_master_clean.drop(11890, axis=0, inplace=True)

In [130]:
# Distinct join_date values that df_database holds for user 540.
df_database[df_database["user_ID"].eq(540)][["join_date"]].drop_duplicates()

Unnamed: 0,join_date
1143,2017-07-01
2772,2016-03-01


In [131]:
# Give every row of user 540 the join date 2016-03-01, the earlier of the two
# join dates df_database lists for this user.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(540), "join_date"
] = pd.Timestamp("2016-03-01")

In [132]:
# Latest palms_date on record for user 540.
df_master_clean[df_master_clean["user_ID"].eq(540)]["palms_date"].max()

Timestamp('2021-02-01 00:00:00')

In [133]:
# Blank out drop_date on all of user 540's rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(540), "drop_date"
] = pd.NaT

#### - User 603 

In [134]:
# Reproducible sample (seed 23) of eight rows belonging to user 603.
df_master_clean[df_master_clean["user_ID"].eq(603)].sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
39078,603,8,0,0,0,4,0,0,0,0,...,0,0,2020-05-01,"Construction, Builder/General Contractor",2020-02-01,2021-12-01,2020-06-11,0,7,2020-12-01
27436,603,16,4,0,0,0,0,2,7,3,...,2971,7,2019-02-01,"Construction, Builder/General Contractor",2020-02-01,2021-12-01,2020-06-11,-1,10,2019-12-01
42819,603,8,4,1,0,0,0,0,8,2,...,6400,7,2020-10-01,"Construction, Builder/General Contractor",2020-02-01,2021-12-01,2020-06-11,0,2,2020-12-01
43551,603,8,2,2,0,0,0,1,2,0,...,1501,3,2020-11-01,"Construction, Builder/General Contractor",2020-02-01,2021-12-01,2020-06-11,0,1,2020-12-01
26595,603,16,4,0,0,0,0,4,23,0,...,7414,11,2019-01-01,"Construction, Builder/General Contractor",2020-02-01,2021-12-01,2020-06-11,-2,11,2019-12-01
45777,603,8,3,0,0,0,0,0,6,0,...,452,4,2021-02-01,"Construction, Builder/General Contractor",2020-02-01,2021-12-01,2020-06-11,1,10,2021-12-01
35430,603,16,0,1,0,0,0,0,0,0,...,0,0,2019-12-01,"Construction, Builder/General Contractor",2020-02-01,2021-12-01,2020-06-11,-1,12,2020-12-01
36699,603,8,3,0,0,0,0,1,8,1,...,3165,2,2020-02-01,"Construction, Builder/General Contractor",2020-02-01,2021-12-01,2020-06-11,0,10,2020-12-01


In [135]:
# Last five df_master_clean rows for user 603 in chapter 16.
df_master_clean[
    df_master_clean["user_ID"].eq(603) & df_master_clean["chapter_ID"].eq(16)
].tail()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
32431,603,16,4,0,0,0,0,3,7,0,...,5465,3,2019-08-01,"Construction, Builder/General Contractor",2020-02-01,2021-12-01,2020-06-11,-1,4,2019-12-01
33206,603,16,4,0,0,0,0,1,3,1,...,8967,8,2019-09-01,"Construction, Builder/General Contractor",2020-02-01,2021-12-01,2020-06-11,-1,3,2019-12-01
33976,603,16,4,0,0,0,1,1,10,0,...,13190,6,2019-10-01,"Construction, Builder/General Contractor",2020-02-01,2021-12-01,2020-06-11,-1,2,2019-12-01
34704,603,16,0,2,0,0,2,0,0,0,...,11937,2,2019-11-01,"Construction, Builder/General Contractor",2020-02-01,2021-12-01,2020-06-11,-1,1,2019-12-01
35430,603,16,0,1,0,0,0,0,0,0,...,0,0,2019-12-01,"Construction, Builder/General Contractor",2020-02-01,2021-12-01,2020-06-11,-1,12,2020-12-01


In [136]:
# First five df_master_clean rows for user 603 in chapter 8.
df_master_clean[
    df_master_clean["user_ID"].eq(603) & df_master_clean["chapter_ID"].eq(8)
].head()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
36699,603,8,3,0,0,0,0,1,8,1,...,3165,2,2020-02-01,"Construction, Builder/General Contractor",2020-02-01,2021-12-01,2020-06-11,0,10,2020-12-01
37514,603,8,3,0,0,0,0,0,5,1,...,3790,2,2020-03-01,"Construction, Builder/General Contractor",2020-02-01,2021-12-01,2020-06-11,0,9,2020-12-01
38298,603,8,2,0,0,3,0,0,0,0,...,0,0,2020-04-01,"Construction, Builder/General Contractor",2020-02-01,2021-12-01,2020-06-11,0,8,2020-12-01
39078,603,8,0,0,0,4,0,0,0,0,...,0,0,2020-05-01,"Construction, Builder/General Contractor",2020-02-01,2021-12-01,2020-06-11,0,7,2020-12-01
39834,603,8,3,0,0,1,0,0,0,0,...,0,0,2020-06-01,"Construction, Builder/General Contractor",2020-02-01,2021-12-01,2020-06-11,0,6,2020-12-01


In [137]:
# Distinct join_date values that df_database holds for user 603.
df_database[df_database["user_ID"].eq(603)][["join_date"]].drop_duplicates()

Unnamed: 0,join_date
451,2020-02-01
2066,2018-10-01


In [138]:
# Give every row of user 603 the join date 2018-10-01, the earlier of the two
# join dates df_database lists for this user.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(603), "join_date"
] = pd.Timestamp("2018-10-01")

In [139]:
# Latest palms_date on record for user 603.
df_master_clean[df_master_clean["user_ID"].eq(603)]["palms_date"].max()

Timestamp('2021-02-01 00:00:00')

In [140]:
# Blank out drop_date on all of user 603's rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(603), "drop_date"
] = pd.NaT

#### - User 1417 

In [141]:
# Reproducible sample (seed 23) of eight rows belonging to user 1417.
df_master_clean[df_master_clean["user_ID"].eq(1417)].sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
16685,1417,3,5,0,0,0,0,0,3,0,...,0,4,2018-01-01,"Trades, Painter",2019-02-01,2020-02-01,NaT,-2,1,2018-02-01
11887,1417,3,4,0,0,0,0,2,0,1,...,0,2,2017-07-01,"Trades, Painter",2019-02-01,2020-02-01,NaT,-2,7,2018-02-01
12700,1417,3,5,0,0,0,0,0,2,0,...,11345,6,2017-08-01,"Trades, Painter",2019-02-01,2020-02-01,NaT,-2,6,2018-02-01
19070,1417,3,3,1,0,0,0,3,1,0,...,6995,4,2018-04-01,"Trades, Painter",2019-02-01,2020-02-01,NaT,-1,10,2019-02-01
20650,1417,3,3,0,0,0,0,0,2,0,...,0,2,2018-06-01,"Trades, Painter",2019-02-01,2020-02-01,NaT,-1,8,2019-02-01
14338,1417,3,4,1,0,0,0,2,1,0,...,0,1,2017-10-01,"Trades, Painter",2019-02-01,2020-02-01,NaT,-2,4,2018-02-01
15142,1417,3,4,0,0,0,0,0,2,0,...,28,7,2017-11-01,"Trades, Painter",2019-02-01,2020-02-01,NaT,-2,3,2018-02-01
11065,1417,3,3,0,0,0,0,0,0,0,...,0,0,2017-06-01,"Trades, Painter",2019-02-01,2020-02-01,NaT,-2,8,2018-02-01


In [142]:
# Distinct join_date values that df_database holds for user 1417.
df_database[df_database["user_ID"].eq(1417)][["join_date"]].drop_duplicates()

Unnamed: 0,join_date
1356,2019-02-01
2801,2017-06-01


In [143]:
# Give every row of user 1417 the join date 2017-06-01, the earlier of the two
# join dates df_database lists for this user.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(1417), "join_date"
] = pd.Timestamp("2017-06-01")

In [144]:
# Latest palms_date on record for user 1417.
df_master_clean[df_master_clean["user_ID"].eq(1417)]["palms_date"].max()

Timestamp('2018-06-01 00:00:00')

In [145]:
# Overwrite drop_date for user 1417 with 2018-06-01, which equals the latest
# palms_date found for this user.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(1417), "drop_date"
] = pd.Timestamp("2018-06-01")

#### - User 28 

In [146]:
# Reproducible sample (seed 23) of eight rows belonging to user 28.
df_master_clean[df_master_clean["user_ID"].eq(28)].sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
30196,28,24,3,0,0,0,2,2,0,1,...,56,7,2019-05-01,"Computer & Programming, IT & Networks",2018-11-01,2021-12-01,NaT,0,7,2019-12-01
34210,28,24,5,0,0,0,0,0,5,0,...,0,5,2019-10-01,"Computer & Programming, IT & Networks",2018-11-01,2021-12-01,NaT,0,2,2019-12-01
21445,28,3,4,0,0,0,1,0,3,0,...,0,0,2018-07-01,"Computer & Programming, IT & Networks",2018-11-01,2021-12-01,NaT,-1,5,2018-12-01
33445,28,24,4,0,0,0,0,0,4,0,...,0,1,2019-09-01,"Computer & Programming, IT & Networks",2018-11-01,2021-12-01,NaT,0,3,2019-12-01
41744,28,24,4,0,0,0,0,2,0,0,...,0,2,2020-08-01,"Computer & Programming, IT & Networks",2018-11-01,2021-12-01,NaT,1,4,2020-12-01
40999,28,24,5,0,0,0,0,1,2,1,...,0,1,2020-07-01,"Computer & Programming, IT & Networks",2018-11-01,2021-12-01,NaT,1,5,2020-12-01
22962,28,3,4,0,0,0,0,0,1,0,...,0,0,2018-09-01,"Computer & Programming, IT & Networks",2018-11-01,2021-12-01,NaT,-1,3,2018-12-01
17507,28,3,4,0,0,0,0,1,4,0,...,0,4,2018-02-01,"Computer & Programming, IT & Networks",2018-11-01,2021-12-01,NaT,-1,10,2018-12-01


In [147]:
# Last five df_master_clean rows for user 28 in chapter 3.
df_master_clean[
    df_master_clean["user_ID"].eq(28) & df_master_clean["chapter_ID"].eq(3)
].tail()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
21445,28,3,4,0,0,0,1,0,3,0,...,0,0,2018-07-01,"Computer & Programming, IT & Networks",2018-11-01,2021-12-01,NaT,-1,5,2018-12-01
22193,28,3,4,0,0,0,0,0,0,0,...,0,0,2018-08-01,"Computer & Programming, IT & Networks",2018-11-01,2021-12-01,NaT,-1,4,2018-12-01
22962,28,3,4,0,0,0,0,0,1,0,...,0,0,2018-09-01,"Computer & Programming, IT & Networks",2018-11-01,2021-12-01,NaT,-1,3,2018-12-01
23773,28,3,4,0,0,0,0,0,1,1,...,0,5,2018-10-01,"Computer & Programming, IT & Networks",2018-11-01,2021-12-01,NaT,-1,2,2018-12-01
24577,28,3,3,1,0,0,0,0,0,0,...,0,0,2018-11-01,"Computer & Programming, IT & Networks",2018-11-01,2021-12-01,NaT,0,1,2018-12-01


In [148]:
# First five df_master_clean rows for user 28 in chapter 24.
df_master_clean[
    df_master_clean["user_ID"].eq(28) & df_master_clean["chapter_ID"].eq(24)
].head()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
26008,28,24,3,0,0,0,0,0,1,1,...,0,0,2018-12-01,"Computer & Programming, IT & Networks",2018-11-01,2021-12-01,NaT,0,12,2019-12-01
26828,28,24,4,0,0,0,0,0,4,1,...,0,3,2019-01-01,"Computer & Programming, IT & Networks",2018-11-01,2021-12-01,NaT,0,11,2019-12-01
27658,28,24,4,0,0,0,0,2,2,1,...,130,4,2019-02-01,"Computer & Programming, IT & Networks",2018-11-01,2021-12-01,NaT,0,10,2019-12-01
28512,28,24,4,0,0,0,0,1,0,1,...,0,4,2019-03-01,"Computer & Programming, IT & Networks",2018-11-01,2021-12-01,NaT,0,9,2019-12-01
29355,28,24,4,0,0,0,0,0,0,0,...,0,5,2019-04-01,"Computer & Programming, IT & Networks",2018-11-01,2021-12-01,NaT,0,8,2019-12-01


In [149]:
# Distinct join_date values that df_database holds for user 28.
df_database[df_database["user_ID"].eq(28)][["join_date"]].drop_duplicates()

Unnamed: 0,join_date
2633,2018-11-01
2810,2017-12-01


In [150]:
# Give every row of user 28 the join date 2017-12-01, the earlier of the two
# join dates df_database lists for this user.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(28), "join_date"
] = pd.Timestamp("2017-12-01")

In [151]:
# Latest palms_date on record for user 28.
df_master_clean[df_master_clean["user_ID"].eq(28)]["palms_date"].max()

Timestamp('2021-02-01 00:00:00')

In [152]:
# Blank out drop_date on all of user 28's rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(28), "drop_date"
] = pd.NaT

#### - User 2793 

In [153]:
# Reproducible sample (seed 23) of eight rows belonging to user 2793.
df_master_clean[df_master_clean["user_ID"].eq(2793)].sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
32403,2793,16,2,0,0,0,1,0,0,7,...,400,0,2019-08-01,"Health & Wellness, Massage Therapist",2020-05-01,2021-11-01,2020-05-03,-1,3,2019-11-01
39267,2793,16,0,1,0,0,0,0,0,0,...,0,0,2020-05-01,"Health & Wellness, Massage Therapist",2020-05-01,2021-11-01,2020-05-03,0,6,2020-11-01
33952,2793,16,4,1,0,0,0,5,6,3,...,7527,5,2019-10-01,"Health & Wellness, Massage Therapist",2020-05-01,2021-11-01,2020-05-03,-1,1,2019-11-01
33175,2793,16,2,0,0,0,2,8,8,4,...,138,0,2019-09-01,"Health & Wellness, Massage Therapist",2020-05-01,2021-11-01,2020-05-03,-1,2,2019-11-01
35403,2793,16,3,1,0,0,0,0,1,4,...,147,40,2019-12-01,"Health & Wellness, Massage Therapist",2020-05-01,2021-11-01,2020-05-03,-1,11,2020-11-01
36136,2793,16,4,0,0,0,0,4,5,1,...,0,34,2020-01-01,"Health & Wellness, Massage Therapist",2020-05-01,2021-11-01,2020-05-03,-1,10,2020-11-01
42166,2793,11,2,0,3,0,0,0,2,6,...,2178,0,2020-09-01,"Health & Wellness, Massage Therapist",2020-05-01,2021-11-01,2020-05-03,0,2,2020-11-01
42910,2793,11,1,0,3,0,0,13,11,3,...,2816,53,2020-10-01,"Health & Wellness, Massage Therapist",2020-05-01,2021-11-01,2020-05-03,0,1,2020-11-01


In [154]:
# Last five df_master_clean rows for user 2793 in chapter 16.
df_master_clean[
    df_master_clean["user_ID"].eq(2793) & df_master_clean["chapter_ID"].eq(16)
].tail()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
36136,2793,16,4,0,0,0,0,4,5,1,...,0,34,2020-01-01,"Health & Wellness, Massage Therapist",2020-05-01,2021-11-01,2020-05-03,-1,10,2020-11-01
36878,2793,16,3,0,0,0,1,1,0,2,...,222,1,2020-02-01,"Health & Wellness, Massage Therapist",2020-05-01,2021-11-01,2020-05-03,-1,9,2020-11-01
37700,2793,16,1,0,2,0,0,8,7,0,...,700,29,2020-03-01,"Health & Wellness, Massage Therapist",2020-05-01,2021-11-01,2020-05-03,-1,8,2020-11-01
38486,2793,16,4,1,0,0,0,8,5,1,...,1962,12,2020-04-01,"Health & Wellness, Massage Therapist",2020-05-01,2021-11-01,2020-05-03,-1,7,2020-11-01
39267,2793,16,0,1,0,0,0,0,0,0,...,0,0,2020-05-01,"Health & Wellness, Massage Therapist",2020-05-01,2021-11-01,2020-05-03,0,6,2020-11-01


In [155]:
# First five df_master_clean rows for user 2793 in chapter 11.
df_master_clean[
    df_master_clean["user_ID"].eq(2793) & df_master_clean["chapter_ID"].eq(11)
].head()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
39160,2793,11,4,0,0,0,0,3,4,4,...,3472,45,2020-05-01,"Health & Wellness, Massage Therapist",2020-05-01,2021-11-01,2020-05-03,0,6,2020-11-01
39921,2793,11,3,0,1,0,0,4,9,1,...,20,29,2020-06-01,"Health & Wellness, Massage Therapist",2020-05-01,2021-11-01,2020-05-03,0,5,2020-11-01
40668,2793,11,2,1,1,0,0,0,4,4,...,685,10,2020-07-01,"Health & Wellness, Massage Therapist",2020-05-01,2021-11-01,2020-05-03,0,4,2020-11-01
41424,2793,11,4,0,0,0,0,0,6,5,...,801,25,2020-08-01,"Health & Wellness, Massage Therapist",2020-05-01,2021-11-01,2020-05-03,0,3,2020-11-01
42166,2793,11,2,0,3,0,0,0,2,6,...,2178,0,2020-09-01,"Health & Wellness, Massage Therapist",2020-05-01,2021-11-01,2020-05-03,0,2,2020-11-01


In [156]:
# User 2793 has two reports dated 2020-05-01: index 39267 (chapter 16) and
# index 39160 (chapter 11). Add the chapter-16 values onto the chapter-11 row
# and then remove the chapter-16 row.
# columns[2:15] is the 13-column slice that starts at "P"; the rendered frame
# elides its middle columns, so confirm the slice ends before palms_date.
# The membership guard keeps the cell re-runnable: once 39267 has been merged
# and dropped, running the cell again is a no-op instead of a KeyError.
if 39267 in df_master_clean.index:
    df_master_clean.loc[[39160], df_master_clean.columns[2:15]] = (
        df_master_clean.loc[39160, df_master_clean.columns[2:15]].values
        + df_master_clean.loc[39267, df_master_clean.columns[2:15]].values
    )
    df_master_clean.drop(39267, axis=0, inplace=True)

In [157]:
# Distinct join_date values that df_database holds for user 2793.
df_database[df_database["user_ID"].eq(2793)][["join_date"]].drop_duplicates()

Unnamed: 0,join_date
754,2020-05-01
2088,2019-08-01


In [158]:
# Give every row of user 2793 the join date 2019-08-01, the earlier of the two
# join dates df_database lists for this user.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(2793), "join_date"
] = pd.Timestamp("2019-08-01")

In [159]:
# Latest palms_date on record for user 2793.
df_master_clean[df_master_clean["user_ID"].eq(2793)]["palms_date"].max()

Timestamp('2021-02-01 00:00:00')

In [160]:
# Blank out drop_date on all of user 2793's rows.
df_master_clean.loc[
    df_master_clean["user_ID"].eq(2793), "drop_date"
] = pd.NaT

#### - User 2752

In [161]:
# Reproducible sample (seed 23) of eight rows belonging to user 2752.
df_master_clean[df_master_clean["user_ID"].eq(2752)].sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
42184,2752,11,1,0,0,0,0,0,0,0,...,0,0,2020-09-01,"Health & Wellness, Chiropractor",2020-10-01,2022-05-01,2020-08-12,-1,8,2021-05-01
38066,2752,28,4,0,0,0,0,4,18,6,...,724,16,2020-03-01,"Health & Wellness, Chiropractor",2020-10-01,2022-05-01,2020-08-12,-1,2,2020-05-01
38853,2752,28,5,0,0,0,0,1,1,1,...,636,20,2020-04-01,"Health & Wellness, Chiropractor",2020-10-01,2022-05-01,2020-08-12,-1,1,2020-05-01
44379,2752,11,4,0,0,0,0,2,2,1,...,4039,3,2020-12-01,"Health & Wellness, Chiropractor",2020-10-01,2022-05-01,2020-08-12,0,5,2021-05-01
45890,2752,11,4,0,0,0,0,5,0,8,...,270,2,2021-02-01,"Health & Wellness, Chiropractor",2020-10-01,2022-05-01,2020-08-12,0,3,2021-05-01
40370,2752,28,4,0,0,0,0,1,1,1,...,128,1,2020-06-01,"Health & Wellness, Chiropractor",2020-10-01,2022-05-01,2020-08-12,-1,11,2021-05-01
41126,2752,28,5,0,0,0,0,4,2,1,...,510,4,2020-07-01,"Health & Wellness, Chiropractor",2020-10-01,2022-05-01,2020-08-12,-1,10,2021-05-01
37284,2752,28,1,0,0,0,0,0,0,0,...,0,0,2020-02-01,"Health & Wellness, Chiropractor",2020-10-01,2022-05-01,2020-08-12,-1,3,2020-05-01


In [162]:
# Last five df_master_clean rows for user 2752 in chapter 28.
df_master_clean[
    df_master_clean["user_ID"].eq(2752) & df_master_clean["chapter_ID"].eq(28)
].tail()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
38853,2752,28,5,0,0,0,0,1,1,1,...,636,20,2020-04-01,"Health & Wellness, Chiropractor",2020-10-01,2022-05-01,2020-08-12,-1,1,2020-05-01
39620,2752,28,4,0,0,0,0,0,2,0,...,0,0,2020-05-01,"Health & Wellness, Chiropractor",2020-10-01,2022-05-01,2020-08-12,-1,12,2021-05-01
40370,2752,28,4,0,0,0,0,1,1,1,...,128,1,2020-06-01,"Health & Wellness, Chiropractor",2020-10-01,2022-05-01,2020-08-12,-1,11,2021-05-01
41126,2752,28,5,0,0,0,0,4,2,1,...,510,4,2020-07-01,"Health & Wellness, Chiropractor",2020-10-01,2022-05-01,2020-08-12,-1,10,2021-05-01
41871,2752,28,3,0,0,0,0,0,0,0,...,0,0,2020-08-01,"Health & Wellness, Chiropractor",2020-10-01,2022-05-01,2020-08-12,-1,9,2021-05-01


In [163]:
# First rows recorded for user 2752 in chapter 11.
df_master_clean.query("user_ID == 2752 and chapter_ID == 11").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
42184,2752,11,1,0,0,0,0,0,0,0,...,0,0,2020-09-01,"Health & Wellness, Chiropractor",2020-10-01,2022-05-01,2020-08-12,-1,8,2021-05-01
42927,2752,11,4,0,0,0,0,4,3,1,...,0,1,2020-10-01,"Health & Wellness, Chiropractor",2020-10-01,2022-05-01,2020-08-12,0,7,2021-05-01
43658,2752,11,3,0,0,0,0,5,0,0,...,1970,0,2020-11-01,"Health & Wellness, Chiropractor",2020-10-01,2022-05-01,2020-08-12,0,6,2021-05-01
44379,2752,11,4,0,0,0,0,2,2,1,...,4039,3,2020-12-01,"Health & Wellness, Chiropractor",2020-10-01,2022-05-01,2020-08-12,0,5,2021-05-01
45114,2752,11,4,0,0,0,0,4,0,4,...,3754,8,2021-01-01,"Health & Wellness, Chiropractor",2020-10-01,2022-05-01,2020-08-12,0,4,2021-05-01


In [164]:
# Distinct join dates that df_database holds for user 2752.
df_database.query("user_ID == 2752")[["join_date"]].drop_duplicates()

Unnamed: 0,join_date
761,2020-10-01
3296,2020-03-01


In [165]:
# Use the earliest join date listed in df_database for user 2752.
df_master_clean.loc[df_master_clean["user_ID"].eq(2752), "join_date"] = pd.Timestamp("2020-03-01")

In [166]:
# Most recent PALMS month on record for user 2752.
df_master_clean.query("user_ID == 2752")["palms_date"].max()

Timestamp('2021-02-01 00:00:00')

In [167]:
# PALMS rows run through 2021-02-01, later than the recorded 2020-08-12 drop, so clear drop_date.
df_master_clean.loc[df_master_clean["user_ID"].eq(2752), "drop_date"] = pd.NaT

#### - User 6 

In [168]:
# Seeded 8-row sample of user 6's monthly rows.
df_master_clean.query("user_ID == 6").sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
32690,6,25,5,0,0,0,0,2,14,12,...,5852,90,2019-08-01,"Automotive, Automotive Mechanical Service",2019-01-01,2021-05-01,2020-12-03,0,9,2020-05-01
38747,6,25,5,0,0,0,0,4,6,7,...,10061,132,2020-04-01,"Automotive, Automotive Mechanical Service",2019-01-01,2021-05-01,2020-12-03,1,1,2020-05-01
35687,6,25,3,0,0,0,0,2,7,9,...,4644,46,2019-12-01,"Automotive, Automotive Mechanical Service",2019-01-01,2021-05-01,2020-12-03,0,5,2020-05-01
31052,6,25,4,0,0,0,0,7,18,13,...,100,65,2019-06-01,"Automotive, Automotive Mechanical Service",2019-01-01,2021-05-01,2020-12-03,0,11,2020-05-01
31893,6,25,2,0,0,2,0,2,16,8,...,284,35,2019-07-01,"Automotive, Automotive Mechanical Service",2019-01-01,2021-05-01,2020-12-03,0,10,2020-05-01
42503,6,25,4,0,0,0,0,2,4,6,...,5984,136,2020-09-01,"Automotive, Automotive Mechanical Service",2019-01-01,2021-05-01,2020-12-03,1,8,2021-05-01
23484,6,20,4,0,0,0,0,0,16,11,...,80,27,2018-09-01,"Automotive, Automotive Mechanical Service",2019-01-01,2021-05-01,2020-12-03,-1,8,2019-05-01
26697,6,20,1,0,0,0,0,0,3,2,...,94,0,2019-01-01,"Automotive, Automotive Mechanical Service",2019-01-01,2021-05-01,2020-12-03,0,4,2019-05-01


In [169]:
# Last rows recorded for user 6 in chapter 20.
df_master_clean.query("user_ID == 6 and chapter_ID == 20").tail(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
23484,6,20,4,0,0,0,0,0,16,11,...,80,27,2018-09-01,"Automotive, Automotive Mechanical Service",2019-01-01,2021-05-01,2020-12-03,-1,8,2019-05-01
24286,6,20,3,1,0,0,0,0,12,15,...,2286,19,2018-10-01,"Automotive, Automotive Mechanical Service",2019-01-01,2021-05-01,2020-12-03,-1,7,2019-05-01
25063,6,20,3,0,0,0,1,5,5,8,...,283,20,2018-11-01,"Automotive, Automotive Mechanical Service",2019-01-01,2021-05-01,2020-12-03,-1,6,2019-05-01
25879,6,20,2,0,0,0,1,2,4,3,...,80,21,2018-12-01,"Automotive, Automotive Mechanical Service",2019-01-01,2021-05-01,2020-12-03,-1,5,2019-05-01
26697,6,20,1,0,0,0,0,0,3,2,...,94,0,2019-01-01,"Automotive, Automotive Mechanical Service",2019-01-01,2021-05-01,2020-12-03,0,4,2019-05-01


In [170]:
# First rows recorded for user 6 in chapter 25.
df_master_clean.query("user_ID == 6 and chapter_ID == 25").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
26857,6,25,4,0,0,0,0,19,27,8,...,2989,47,2019-01-01,"Automotive, Automotive Mechanical Service",2019-01-01,2021-05-01,2020-12-03,0,4,2019-05-01
27697,6,25,4,0,0,0,0,5,27,12,...,5921,65,2019-02-01,"Automotive, Automotive Mechanical Service",2019-01-01,2021-05-01,2020-12-03,0,3,2019-05-01
28543,6,25,4,0,0,0,0,2,30,5,...,1189,72,2019-03-01,"Automotive, Automotive Mechanical Service",2019-01-01,2021-05-01,2020-12-03,0,2,2019-05-01
29386,6,25,4,0,0,0,0,6,8,25,...,342,71,2019-04-01,"Automotive, Automotive Mechanical Service",2019-01-01,2021-05-01,2020-12-03,0,1,2019-05-01
30226,6,25,5,0,0,0,0,1,35,19,...,105,84,2019-05-01,"Automotive, Automotive Mechanical Service",2019-01-01,2021-05-01,2020-12-03,0,12,2020-05-01


In [171]:
# Rows 26697 (chapter 20) and 26857 (chapter 25) both describe user 6 for 2019-01-01.
# Add the values in column positions 2..14 (presumably the activity counters P through CEU
# -- confirm against df_master_clean.columns) of row 26697 onto row 26857, then remove row 26697.
# The index-membership guard keeps the cell re-runnable: without it a second execution
# raises KeyError because row 26697 has already been dropped.
if 26697 in df_master_clean.index:
    counter_cols = df_master_clean.columns[2:15]
    df_master_clean.loc[[26857], counter_cols] = (
        df_master_clean.loc[26857, counter_cols].values
        + df_master_clean.loc[26697, counter_cols].values
    )
    df_master_clean.drop(26697, axis=0, inplace=True)

In [172]:
# Distinct join dates that df_database holds for user 6.
df_database.query("user_ID == 6")[["join_date"]].drop_duplicates()

Unnamed: 0,join_date
1166,2019-01-01
1194,2020-12-01
2988,2018-05-01


In [173]:
# Use the earliest join date listed in df_database for user 6.
df_master_clean.loc[df_master_clean["user_ID"].eq(6), "join_date"] = pd.Timestamp("2018-05-01")

In [174]:
# Most recent PALMS month on record for user 6.
df_master_clean.query("user_ID == 6")["palms_date"].max()

Timestamp('2021-02-01 00:00:00')

In [175]:
# PALMS rows run through 2021-02-01, later than the recorded 2020-12-03 drop, so clear drop_date.
df_master_clean.loc[df_master_clean["user_ID"].eq(6), "drop_date"] = pd.NaT

#### - User 1260 

In [176]:
# Seeded 8-row sample of user 1260's monthly rows.
df_master_clean.query("user_ID == 1260").sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
10550,1260,14,2,2,0,0,0,0,0,0,...,1551,0,2017-05-01,"Trades, Electrician",2019-03-01,2020-02-01,2019-08-27,-2,9,2018-02-01
11378,1260,14,3,0,0,0,1,0,4,0,...,674437,4,2017-06-01,"Trades, Electrician",2019-03-01,2020-02-01,2019-08-27,-2,8,2018-02-01
12198,1260,14,4,0,0,0,0,3,7,2,...,9845,5,2017-07-01,"Trades, Electrician",2019-03-01,2020-02-01,2019-08-27,-2,7,2018-02-01
30427,1260,5,2,1,0,0,1,0,3,0,...,786,2,2019-06-01,"Trades, Electrician",2019-03-01,2020-02-01,2019-08-27,0,8,2020-02-01
32860,1260,5,1,0,0,0,0,0,0,0,...,0,0,2019-09-01,"Trades, Electrician",2019-03-01,2020-02-01,2019-08-27,0,5,2020-02-01
13839,1260,14,0,2,0,0,0,0,0,0,...,267,0,2017-09-01,"Trades, Electrician",2019-03-01,2020-02-01,2019-08-27,-2,5,2018-02-01
27065,1260,5,1,0,0,0,1,5,0,1,...,918,2,2019-02-01,"Trades, Electrician",2019-03-01,2020-02-01,2019-08-27,-1,12,2020-02-01
27915,1260,5,4,0,0,0,0,2,4,0,...,3255,2,2019-03-01,"Trades, Electrician",2019-03-01,2020-02-01,2019-08-27,0,11,2020-02-01


In [177]:
# Last rows recorded for user 1260 in chapter 14.
df_master_clean.query("user_ID == 1260 and chapter_ID == 14").tail(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
10550,1260,14,2,2,0,0,0,0,0,0,...,1551,0,2017-05-01,"Trades, Electrician",2019-03-01,2020-02-01,2019-08-27,-2,9,2018-02-01
11378,1260,14,3,0,0,0,1,0,4,0,...,674437,4,2017-06-01,"Trades, Electrician",2019-03-01,2020-02-01,2019-08-27,-2,8,2018-02-01
12198,1260,14,4,0,0,0,0,3,7,2,...,9845,5,2017-07-01,"Trades, Electrician",2019-03-01,2020-02-01,2019-08-27,-2,7,2018-02-01
13005,1260,14,2,2,0,0,1,0,1,1,...,2419,1,2017-08-01,"Trades, Electrician",2019-03-01,2020-02-01,2019-08-27,-2,6,2018-02-01
13839,1260,14,0,2,0,0,0,0,0,0,...,267,0,2017-09-01,"Trades, Electrician",2019-03-01,2020-02-01,2019-08-27,-2,5,2018-02-01


In [178]:
# First rows recorded for user 1260 in chapter 5.
df_master_clean.query("user_ID == 1260 and chapter_ID == 5").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
27065,1260,5,1,0,0,0,1,5,0,1,...,918,2,2019-02-01,"Trades, Electrician",2019-03-01,2020-02-01,2019-08-27,-1,12,2020-02-01
27915,1260,5,4,0,0,0,0,2,4,0,...,3255,2,2019-03-01,"Trades, Electrician",2019-03-01,2020-02-01,2019-08-27,0,11,2020-02-01
28758,1260,5,4,1,0,0,0,3,4,0,...,112398,6,2019-04-01,"Trades, Electrician",2019-03-01,2020-02-01,2019-08-27,0,10,2020-02-01
29599,1260,5,4,0,0,0,0,1,2,0,...,7727,4,2019-05-01,"Trades, Electrician",2019-03-01,2020-02-01,2019-08-27,0,9,2020-02-01
30427,1260,5,2,1,0,0,1,0,3,0,...,786,2,2019-06-01,"Trades, Electrician",2019-03-01,2020-02-01,2019-08-27,0,8,2020-02-01


In [179]:
# Distinct join dates that df_database holds for user 1260.
df_database.query("user_ID == 1260")[["join_date"]].drop_duplicates()

Unnamed: 0,join_date
1868,2019-03-01
2636,2019-02-01
2668,2017-05-01


In [180]:
# Use the earliest join date listed in df_database for user 1260.
df_master_clean.loc[df_master_clean["user_ID"].eq(1260), "join_date"] = pd.Timestamp("2017-05-01")

In [181]:
# Most recent PALMS month on record for user 1260.
df_master_clean.query("user_ID == 1260")["palms_date"].max()

Timestamp('2019-09-01 00:00:00')

In [182]:
# Set drop_date to the month of user 1260's last PALMS row (2019-09-01).
df_master_clean.loc[df_master_clean["user_ID"].eq(1260), "drop_date"] = pd.Timestamp("2019-09-01")

#### - User 1084

In [183]:
# Seeded 8-row sample of user 1084's monthly rows.
df_master_clean.query("user_ID == 1084").sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
23098,1084,8,4,0,0,0,0,2,3,0,...,261,10,2018-09-01,"Media, Social Media",2017-10-01,2020-10-01,2020-05-14,0,1,2018-10-01
27153,1084,8,3,1,0,0,0,3,8,1,...,340,10,2019-02-01,"Media, Social Media",2017-10-01,2020-10-01,2020-05-14,1,8,2019-10-01
15248,1084,8,5,0,0,0,0,2,9,3,...,1740,18,2017-11-01,"Media, Social Media",2017-10-01,2020-10-01,2020-05-14,0,11,2018-10-01
26324,1084,8,3,0,0,0,1,2,6,1,...,7080,5,2019-01-01,"Media, Social Media",2017-10-01,2020-10-01,2020-05-14,1,9,2019-10-01
35218,1084,8,4,0,0,0,0,1,0,0,...,2386,1,2019-12-01,"Media, Social Media",2017-10-01,2020-10-01,2020-05-14,2,10,2020-10-01
34490,1084,8,3,1,0,0,0,0,2,0,...,3417,9,2019-11-01,"Media, Social Media",2017-10-01,2020-10-01,2020-05-14,2,11,2020-10-01
16793,1084,8,3,0,0,0,0,3,9,1,...,0,16,2018-01-01,"Media, Social Media",2017-10-01,2020-10-01,2020-05-14,0,9,2018-10-01
11755,1084,30,4,0,0,0,0,3,1,0,...,1153,7,2017-06-01,"Media, Social Media",2017-10-01,2020-10-01,2020-05-14,-1,4,2017-10-01


In [184]:
# Last rows recorded for user 1084 in chapter 30.
df_master_clean.query("user_ID == 1084 and chapter_ID == 30").tail(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
10115,1084,30,4,0,0,0,0,0,2,0,...,80,2,2017-04-01,"Media, Social Media",2017-10-01,2020-10-01,2020-05-14,-1,6,2017-10-01
10935,1084,30,5,0,0,0,0,0,7,0,...,1446,9,2017-05-01,"Media, Social Media",2017-10-01,2020-10-01,2020-05-14,-1,5,2017-10-01
11755,1084,30,4,0,0,0,0,3,1,0,...,1153,7,2017-06-01,"Media, Social Media",2017-10-01,2020-10-01,2020-05-14,-1,4,2017-10-01
12580,1084,30,4,0,0,0,0,0,2,0,...,2350,7,2017-07-01,"Media, Social Media",2017-10-01,2020-10-01,2020-05-14,-1,3,2017-10-01
13421,1084,30,3,0,0,0,1,0,0,0,...,0,0,2017-08-01,"Media, Social Media",2017-10-01,2020-10-01,2020-05-14,-1,2,2017-10-01


In [185]:
# First rows recorded for user 1084 in chapter 8.
df_master_clean.query("user_ID == 1084 and chapter_ID == 8").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
13651,1084,8,1,0,0,0,0,0,0,0,...,0,0,2017-09-01,"Media, Social Media",2017-10-01,2020-10-01,2020-05-14,-1,1,2017-10-01
14456,1084,8,4,0,0,0,0,3,7,0,...,1501,9,2017-10-01,"Media, Social Media",2017-10-01,2020-10-01,2020-05-14,0,12,2018-10-01
15248,1084,8,5,0,0,0,0,2,9,3,...,1740,18,2017-11-01,"Media, Social Media",2017-10-01,2020-10-01,2020-05-14,0,11,2018-10-01
16025,1084,8,1,0,0,0,2,0,0,0,...,0,0,2017-12-01,"Media, Social Media",2017-10-01,2020-10-01,2020-05-14,0,10,2018-10-01
16793,1084,8,3,0,0,0,0,3,9,1,...,0,16,2018-01-01,"Media, Social Media",2017-10-01,2020-10-01,2020-05-14,0,9,2018-10-01


In [186]:
# Distinct join dates that df_database holds for user 1084.
df_database.query("user_ID == 1084")[["join_date"]].drop_duplicates()

Unnamed: 0,join_date
413,2017-10-01
1323,2017-03-01


In [187]:
# Use the earliest join date listed in df_database for user 1084.
df_master_clean.loc[df_master_clean["user_ID"].eq(1084), "join_date"] = pd.Timestamp("2017-03-01")

In [188]:
# Most recent PALMS month on record for user 1084.
df_master_clean.query("user_ID == 1084")["palms_date"].max()

Timestamp('2020-05-01 00:00:00')

In [189]:
# Set drop_date to the month of user 1084's last PALMS row (2020-05-01).
df_master_clean.loc[df_master_clean["user_ID"].eq(1084), "drop_date"] = pd.Timestamp("2020-05-01")

#### - User 479

In [190]:
# Seeded 8-row sample of user 479's monthly rows.
df_master_clean.query("user_ID == 479").sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
34,479,3,4,1,0,0,0,0,0,0,...,391,0,2016-03-01,"Legal, Paralegal",2016-11-01,2017-11-01,2017-07-17,-1,8,2016-11-01
8513,479,22,3,0,0,0,1,0,4,1,...,405,7,2017-02-01,"Legal, Paralegal",2016-11-01,2017-11-01,2017-07-17,0,9,2017-11-01
1460,479,3,5,0,0,0,0,0,3,0,...,0,2,2016-05-01,"Legal, Paralegal",2016-11-01,2017-11-01,2017-07-17,-1,6,2016-11-01
751,479,3,3,1,0,0,0,3,3,1,...,1820,3,2016-04-01,"Legal, Paralegal",2016-11-01,2017-11-01,2017-07-17,-1,7,2016-11-01
2904,479,3,3,0,0,0,1,0,4,0,...,39,3,2016-07-01,"Legal, Paralegal",2016-11-01,2017-11-01,2017-07-17,-1,4,2016-11-01
3613,479,3,4,0,0,0,0,0,2,1,...,220,3,2016-08-01,"Legal, Paralegal",2016-11-01,2017-11-01,2017-07-17,-1,3,2016-11-01
9978,479,22,3,1,0,0,0,0,0,0,...,2839,5,2017-04-01,"Legal, Paralegal",2016-11-01,2017-11-01,2017-07-17,0,7,2017-11-01
10768,479,22,4,0,0,0,0,3,4,0,...,481,4,2017-05-01,"Legal, Paralegal",2016-11-01,2017-11-01,2017-07-17,0,6,2017-11-01


In [191]:
# Last rows recorded for user 479 in chapter 3.
df_master_clean.query("user_ID == 479 and chapter_ID == 3").tail(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
751,479,3,3,1,0,0,0,3,3,1,...,1820,3,2016-04-01,"Legal, Paralegal",2016-11-01,2017-11-01,2017-07-17,-1,7,2016-11-01
1460,479,3,5,0,0,0,0,0,3,0,...,0,2,2016-05-01,"Legal, Paralegal",2016-11-01,2017-11-01,2017-07-17,-1,6,2016-11-01
2176,479,3,4,0,0,0,0,0,1,3,...,2815,1,2016-06-01,"Legal, Paralegal",2016-11-01,2017-11-01,2017-07-17,-1,5,2016-11-01
2904,479,3,3,0,0,0,1,0,4,0,...,39,3,2016-07-01,"Legal, Paralegal",2016-11-01,2017-11-01,2017-07-17,-1,4,2016-11-01
3613,479,3,4,0,0,0,0,0,2,1,...,220,3,2016-08-01,"Legal, Paralegal",2016-11-01,2017-11-01,2017-07-17,-1,3,2016-11-01


In [192]:
# First rows recorded for user 479 in chapter 22.
df_master_clean.query("user_ID == 479 and chapter_ID == 22").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
5573,479,22,2,0,0,0,0,0,0,0,...,0,0,2016-10-01,"Legal, Paralegal",2016-11-01,2017-11-01,2017-07-17,-1,1,2016-11-01
6304,479,22,4,0,0,0,0,0,0,2,...,277,0,2016-11-01,"Legal, Paralegal",2016-11-01,2017-11-01,2017-07-17,0,12,2017-11-01
7055,479,22,2,1,0,0,0,0,0,0,...,8870,0,2016-12-01,"Legal, Paralegal",2016-11-01,2017-11-01,2017-07-17,0,11,2017-11-01
7790,479,22,3,1,0,0,0,0,2,2,...,2085,20,2017-01-01,"Legal, Paralegal",2016-11-01,2017-11-01,2017-07-17,0,10,2017-11-01
8513,479,22,3,0,0,0,1,0,4,1,...,405,7,2017-02-01,"Legal, Paralegal",2016-11-01,2017-11-01,2017-07-17,0,9,2017-11-01


In [193]:
# Distinct join dates that df_database holds for user 479.
df_database.query("user_ID == 479")[["join_date"]].drop_duplicates()

Unnamed: 0,join_date
150,2016-11-01
2770,2016-03-01


In [194]:
# Use the earliest join date listed in df_database for user 479.
df_master_clean.loc[df_master_clean["user_ID"].eq(479), "join_date"] = pd.Timestamp("2016-03-01")

In [195]:
# Most recent PALMS month on record for user 479.
df_master_clean.query("user_ID == 479")["palms_date"].max()

Timestamp('2017-08-01 00:00:00')

In [196]:
# Set drop_date to the month of user 479's last PALMS row (2017-08-01).
df_master_clean.loc[df_master_clean["user_ID"].eq(479), "drop_date"] = pd.Timestamp("2017-08-01")

#### - User 2237 

In [197]:
# Seeded 8-row sample of user 2237's monthly rows.
df_master_clean.query("user_ID == 2237").sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
40597,2237,9,5,0,0,0,0,0,1,1,...,2000,16,2020-07-01,"Advertising & Marketing, Social Media",2019-06-01,2021-06-01,2016-08-09,1,11,2021-06-01
42835,2237,9,5,0,0,0,0,2,0,0,...,2246,4,2020-10-01,"Advertising & Marketing, Social Media",2019-06-01,2021-06-01,2016-08-09,1,8,2021-06-01
3163,2237,16,4,0,0,0,0,0,0,0,...,0,0,2016-07-01,"Advertising & Marketing, Social Media",2019-06-01,2021-06-01,2016-08-09,-3,11,2017-06-01
2440,2237,16,5,0,0,0,0,1,1,4,...,10801,0,2016-06-01,"Advertising & Marketing, Social Media",2019-06-01,2021-06-01,2016-08-09,-3,12,2017-06-01
39851,2237,9,3,0,1,0,0,0,3,1,...,0,20,2020-06-01,"Advertising & Marketing, Social Media",2019-06-01,2021-06-01,2016-08-09,1,12,2021-06-01
39095,2237,9,4,0,0,0,0,0,0,0,...,850,0,2020-05-01,"Advertising & Marketing, Social Media",2019-06-01,2021-06-01,2016-08-09,0,1,2020-06-01
37530,2237,9,3,0,0,0,0,0,0,0,...,0,2,2020-03-01,"Advertising & Marketing, Social Media",2019-06-01,2021-06-01,2016-08-09,0,3,2020-06-01
43567,2237,9,4,0,0,0,0,0,3,0,...,2073,10,2020-11-01,"Advertising & Marketing, Social Media",2019-06-01,2021-06-01,2016-08-09,1,7,2021-06-01


In [198]:
# Last rows recorded for user 2237 in chapter 16.
df_master_clean.query("user_ID == 2237 and chapter_ID == 16").tail(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
1018,2237,16,3,1,0,0,0,1,0,1,...,875,11,2016-04-01,"Advertising & Marketing, Social Media",2019-06-01,2021-06-01,2016-08-09,-4,2,2016-06-01
1726,2237,16,4,0,0,0,0,0,2,0,...,0,0,2016-05-01,"Advertising & Marketing, Social Media",2019-06-01,2021-06-01,2016-08-09,-4,1,2016-06-01
2440,2237,16,5,0,0,0,0,1,1,4,...,10801,0,2016-06-01,"Advertising & Marketing, Social Media",2019-06-01,2021-06-01,2016-08-09,-3,12,2017-06-01
3163,2237,16,4,0,0,0,0,0,0,0,...,0,0,2016-07-01,"Advertising & Marketing, Social Media",2019-06-01,2021-06-01,2016-08-09,-3,11,2017-06-01
3898,2237,16,3,0,0,0,0,0,0,0,...,0,0,2016-08-01,"Advertising & Marketing, Social Media",2019-06-01,2021-06-01,2016-08-09,-3,10,2017-06-01


In [199]:
# First rows recorded for user 2237 in chapter 9.
df_master_clean.query("user_ID == 2237 and chapter_ID == 9").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
31380,2237,9,3,0,0,0,0,2,0,4,...,0,0,2019-07-01,"Advertising & Marketing, Social Media",2019-06-01,2021-06-01,2016-08-09,0,11,2020-06-01
32207,2237,9,4,0,0,1,0,1,0,1,...,3000,5,2019-08-01,"Advertising & Marketing, Social Media",2019-06-01,2021-06-01,2016-08-09,0,10,2020-06-01
32981,2237,9,2,1,0,0,1,0,0,3,...,0,0,2019-09-01,"Advertising & Marketing, Social Media",2019-06-01,2021-06-01,2016-08-09,0,9,2020-06-01
33761,2237,9,2,1,2,0,0,1,0,0,...,3000,0,2019-10-01,"Advertising & Marketing, Social Media",2019-06-01,2021-06-01,2016-08-09,0,8,2020-06-01
34517,2237,9,3,0,0,1,0,0,0,0,...,75,7,2019-11-01,"Advertising & Marketing, Social Media",2019-06-01,2021-06-01,2016-08-09,0,7,2020-06-01


In [200]:
# Distinct join dates that df_database holds for user 2237.
df_database.query("user_ID == 2237")[["join_date"]].drop_duplicates()

Unnamed: 0,join_date
1579,2019-06-01
2017,2015-06-01
2290,2014-08-01


In [201]:
# Use the earliest join date listed in df_database for user 2237.
df_master_clean.loc[df_master_clean["user_ID"].eq(2237), "join_date"] = pd.Timestamp("2014-08-01")

In [202]:
# Most recent PALMS month on record for user 2237.
df_master_clean.query("user_ID == 2237")["palms_date"].max()

Timestamp('2021-02-01 00:00:00')

In [203]:
# PALMS rows run through 2021-02-01, later than the recorded 2016-08-09 drop, so clear drop_date.
df_master_clean.loc[df_master_clean["user_ID"].eq(2237), "drop_date"] = pd.NaT

#### - User 1580 

In [204]:
# Seeded 8-row sample of user 1580's monthly rows.
df_master_clean.query("user_ID == 1580").sample(n=8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
42578,1580,27,5,0,0,0,0,0,7,6,...,1000,10,2020-09-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,0,10,2021-07-01
44035,1580,27,3,0,0,0,0,0,4,2,...,0,5,2020-11-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,0,8,2021-07-01
38841,1580,28,1,1,0,0,0,0,0,0,...,660,0,2020-04-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,-1,3,2020-07-01
46299,1580,27,4,0,0,0,0,1,4,1,...,0,3,2021-02-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,0,5,2021-07-01
41837,1580,27,4,0,0,0,0,1,2,1,...,1000,10,2020-08-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,0,11,2021-07-01
37272,1580,28,1,0,0,0,0,0,0,0,...,0,0,2020-02-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,-1,5,2020-07-01
45539,1580,27,4,0,0,0,0,2,4,3,...,123,6,2021-01-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,0,6,2021-07-01
38054,1580,28,4,0,0,0,0,6,0,2,...,1680,41,2020-03-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,-1,4,2020-07-01


In [205]:
# Last rows recorded for user 1580 in chapter 28.
df_master_clean.query("user_ID == 1580 and chapter_ID == 28").tail(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
37272,1580,28,1,0,0,0,0,0,0,0,...,0,0,2020-02-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,-1,5,2020-07-01
38054,1580,28,4,0,0,0,0,6,0,2,...,1680,41,2020-03-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,-1,4,2020-07-01
38841,1580,28,1,1,0,0,0,0,0,0,...,660,0,2020-04-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,-1,3,2020-07-01


In [206]:
# First rows recorded for user 1580 in chapter 27.
df_master_clean.query("user_ID == 1580 and chapter_ID == 27").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
41090,1580,27,3,0,0,0,0,0,0,1,...,0,9,2020-07-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,0,12,2021-07-01
41837,1580,27,4,0,0,0,0,1,2,1,...,1000,10,2020-08-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,0,11,2021-07-01
42578,1580,27,5,0,0,0,0,0,7,6,...,1000,10,2020-09-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,0,10,2021-07-01
43315,1580,27,4,0,0,0,0,0,6,1,...,0,7,2020-10-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,0,9,2021-07-01
44035,1580,27,3,0,0,0,0,0,4,2,...,0,5,2020-11-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,0,8,2021-07-01


In [207]:
# Distinct join dates that df_database holds for user 1580.
df_database.query("user_ID == 1580")[["join_date"]].drop_duplicates()

Unnamed: 0,join_date
3170,2020-07-01
3303,2020-03-01
3326,2019-09-01


In [208]:
# Use the earliest join date listed in df_database for user 1580.
df_master_clean.loc[df_master_clean["user_ID"].eq(1580), "join_date"] = pd.Timestamp("2019-09-01")

In [209]:
# Most recent PALMS month on record for user 1580.
df_master_clean.query("user_ID == 1580")["palms_date"].max()

Timestamp('2021-02-01 00:00:00')

In [210]:
# PALMS rows run through 2021-02-01, later than the recorded 2020-03-18 drop, so clear drop_date.
df_master_clean.loc[df_master_clean["user_ID"].eq(1580), "drop_date"] = pd.NaT

### This TEST

In [211]:
# Preview the first five rows of the cleaned master frame.
df_master_clean.head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
0,733,1,3,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Food/Beverages, Caterer",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01
2,1150,1,3,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Trades, Heating & A/C",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01
3,414,1,3,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Insurance, Group Benefits Consultant",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01
4,1721,1,3,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Marketing, Marketing Services",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01
5,1919,1,4,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Real Estate, Real Estate Sales Representative ...",2016-04-01,2018-12-01,NaT,-1,9,2016-12-01


In [212]:
# Row count before de-duplication.
len(df_master_clean)

29207

In [213]:
# Keep only the first row for each (user, chapter, month) key, then show how many rows remain.
dedup_key = ["user_ID", "chapter_ID", "palms_date"]
df_master_clean.drop_duplicates(subset=dedup_key, keep="first", inplace=True)
len(df_master_clean)

29185

In [215]:
# Persist the cleaned, not-yet-aggregated frame (index included) as CSV.
df_master_clean.to_csv(path_or_buf="data/df_master_Cleaned_NoAgg.csv")

## Re-calculate membership year

## Delete unnecessary columns

## Aggregate 9-months data
Ensure that each groupby sum is aggregated over 9 months, not fewer.

In [38]:
# Work on an independent copy so the aggregation steps leave df_master_clean untouched.
df_9_months = df_master_clean.copy(deep=True)

# Disabled steps, kept for reference:
# df_9_months = df_9_months.loc[df_9_months["months_to_renewal"] >= 4]
# df_3_months.drop(["years_to_renewal", "months_to_renewal"], axis=1, inplace=True)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,years_to_renewal,next_renewal_date
37248,734,27,1,0,0,0,0,0,0,0,...,0,2020-02-01,"Health & Wellness, Massage Therapist",2020-03-01,2021-08-01,NaT,-1,6,1,2020-08-01
38028,734,27,4,0,0,0,0,2,3,0,...,11,2020-03-01,"Health & Wellness, Massage Therapist",2020-03-01,2021-08-01,NaT,0,5,1,2020-08-01
38803,822,27,5,0,0,0,0,3,0,2,...,0,2020-04-01,"Office, Commercial Painting",2019-10-01,2020-10-01,NaT,0,6,0,2020-10-01
38809,2904,27,5,0,0,0,0,0,2,0,...,7,2020-04-01,"Legal & Accounting, Bookkeeping",2020-03-01,2021-10-01,NaT,0,6,1,2020-10-01
38816,734,27,5,0,0,0,0,3,3,2,...,13,2020-04-01,"Health & Wellness, Massage Therapist",2020-03-01,2021-08-01,NaT,0,4,1,2020-08-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46298,734,27,4,0,0,0,0,0,4,5,...,8,2021-02-01,"Health & Wellness, Massage Therapist",2020-03-01,2021-08-01,NaT,0,6,0,2021-08-01
46299,1580,27,4,0,0,0,0,1,4,1,...,3,2021-02-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,0,5,0,2021-07-01
46300,1580,27,4,0,0,0,0,1,4,1,...,3,2021-02-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-09-28,0,5,0,2021-07-01
46301,1580,27,4,0,0,0,0,1,4,1,...,3,2021-02-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,0,5,0,2021-07-01


In [21]:
# Sum the numeric columns per (user_ID, renewal_date) group.
# numeric_only=True is passed explicitly: with pandas >= 2.0 the default sum() would try to
# add up the datetime/text columns as well (raising or concatenating strings) instead of
# leaving them out, as the numeric-only output recorded below this cell requires.
# groupby() does not modify its input, so no defensive copy is needed.
# NOTE(review): df_3_months is not defined in the visible part of this notebook (the nearby
# cell builds df_9_months) -- confirm which frame is meant before relying on a fresh run.
# NOTE(review): summing chapter_ID and year_of_membership yields totals, not identifiers.
test = df_3_months.groupby(["user_ID", "renewal_date"]).sum(numeric_only=True)
# test.reset_index(level=["renewal_date"], inplace=True)
test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,year_of_membership
user_ID,renewal_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2,2017-05-01,51,11,0,0,0,0,1,13,5,12,0,10,4503,1,3
2,2018-05-01,51,12,0,0,0,0,2,3,2,6,0,12,4027,4,6
2,2019-05-01,51,10,0,0,0,0,2,3,3,4,1,5,1445,2,9
2,2020-05-01,51,10,0,0,0,1,4,3,6,27,3,9,1873,9,12
2,2021-05-01,51,9,1,0,0,0,0,9,5,4,0,4,1000,1,15


In [125]:
# Show the aggregated rows for user 420, indexed by renewal_date.
# reset_index() first turns every index level into a column, so the filter works both when
# user_ID is an index level (as produced by groupby(["user_ID", "renewal_date"])) and when
# it is already a regular column; a plain test["user_ID"] lookup raises KeyError in the
# former case.
test.reset_index().query("user_ID == 420").set_index("renewal_date")

Unnamed: 0_level_0,user_ID,chapter_ID,profession,membership_length,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU
renewal_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2016-07-01,420,8,"Finance & Insurance, Financial Advisor/Financi...",17,4,1,0,0,0,1,1,0,0,0,1,3195,1
2017-07-01,420,8,"Finance & Insurance, Financial Advisor/Financi...",18,9,1,0,0,3,1,2,2,1,0,16,7421,3
2018-07-01,420,8,"Finance & Insurance, Financial Advisor/Financi...",19,11,1,0,0,0,9,8,1,8,1,15,21651,23
2019-07-01,420,8,"Finance & Insurance, Financial Advisor/Financi...",20,9,1,0,0,2,1,8,1,18,1,7,3734,11
2020-07-01,420,8,"Finance & Insurance, Financial Advisor/Financi...",21,11,0,0,0,0,2,14,0,8,1,7,1796,14
2021-07-01,420,8,"Finance & Insurance, Financial Advisor/Financi...",22,7,0,0,0,0,2,11,2,7,1,9,3891,11
