# Libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import date

# Data Import & join
### Datasets: PALMS
First ensure that none of the files are duplicated by checking control sums.

In [2]:
start_year = 2015
start_month = 1
n_files = 81

df_temp = pd.read_csv("data/Region_Summary_PALMS_Report_2015_01.csv", index_col=0, encoding="ISO-8859-1")
sum_previous = df_temp.sum()

for i in range(n_files):
    month = (start_month + i) % 12 + 1
    month = str(month)
    if len(month) == 1:
        month = "0" + month
    else:
        pass
    year = start_year + ((start_month + i) // 12)
    
    df_temp = pd.read_csv(f"data/Region_Summary_PALMS_Report_{year}_{month}.csv", index_col=0, encoding="ISO-8859-1")
    sum_current = df_temp.sum()
    
    # If all of the column sums are the same, then sum of the boolean comparison on the left
    # will be equal to the number of columns seen on the right
    if (sum_previous == sum_current).sum() == sum_current.shape[0]:
        print("Duplicated readings")
        print(f"Current file: region-palms-report_{year}_{month}.csv")
#         break
    else:
        sum_previous = sum_current

The loop hasn't been broken for any of the instances which means that the files are not duplicate. At least they are not positioned month by month but it is even more unlikely that a duplicated file has been saved in a file where the month differs by more than one.

Just to double-check lets check the condition for the same file.

In [3]:
(sum_current == sum_current).sum() == sum_current.shape[0]

True

As expected - everything is working correctly. Lets import the files and concatenate them.

In [4]:
start_year = 2014
start_month = 12
n_files = 82

df_palms = pd.DataFrame()

for i in range(n_files):
    month = (start_month + i) % 12 + 1
    month = str(month)
    if len(month) == 1:
        month = "0" + month
    else:
        pass
    year = start_year + ((start_month + i) // 12)
#     print(f"region-palms-report_{year}_{month}.csv")
    
    df_temp = pd.read_csv(f"data/Region_Summary_PALMS_Report_{year}_{month}.csv", index_col=0, encoding="ISO-8859-1")
    df_temp["palms_date"] = date(year, int(month), 1)

    df_palms = pd.concat([df_palms, df_temp])

column_list = df_palms.columns.tolist()
column_list = column_list[-3:-1] + column_list[:-3] + [column_list[-1]]
df_palms = df_palms[column_list]

df_palms.reset_index(inplace=True, drop=True)
df_palms

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,palms_date
0,202,1,4,0,0,0,0,1,2,0,5,0,2,0,0,2015-01-01
1,1001,1,4,0,0,0,0,0,5,0,6,1,3,150,0,2015-01-01
2,1060,1,1,0,0,0,0,0,0,0,0,0,0,0,0,2015-01-01
3,702,1,4,0,0,0,0,0,4,0,0,0,2,0,0,2015-01-01
4,1634,1,3,0,0,1,0,1,0,0,5,0,1,0,0,2015-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38708,3005,32,2,1,1,0,0,1,2,2,1,0,0,100,0,2021-10-01
38709,3015,32,4,0,0,0,0,3,5,18,11,2,2,356,1,2021-10-01
38710,294,32,1,0,0,3,0,0,0,2,0,0,0,0,0,2021-10-01
38711,617,32,3,1,0,0,0,7,0,1,3,0,0,524,0,2021-10-01


### Dataset: database

In [5]:
df_database = pd.read_csv("data/database_data.csv", index_col=0, encoding="ISO-8859-1")
df_database.head(10)

Unnamed: 0,user_ID,chapter_ID,industry,sponsor_ID,join_date,renewal_date
0,1034,23,"Medical, Chiropractor",,07/01/2004,12/01/2010
1,909,23,"Trades, Renovations-Remodeling",,02/01/2008,12/01/2008
2,2062,23,"Food/Beverages, Chef",,09/01/2010,12/01/2011
3,947,23,"Gifts, Gift Baskets",947.0,03/01/2007,04/07/2008
4,2311,23,"Mortgage, Mortgage Broker",,07/01/2007,07/01/2008
5,1536,23,"Real Estate Services, Residential Real Estate ...",,07/01/2007,04/01/2021
6,2486,23,"Insurance, Life,Health and Disability Insurance",,07/01/2007,11/01/2009
7,2322,23,"Financial, Investment Advisor",,07/01/2007,04/01/2009
8,878,23,"Health and Wellness, Massage Therapist",,07/01/2007,07/01/2008
9,753,23,"Insurance, General-Motor Insurance",947.0,10/01/2007,10/01/2008


In [6]:
df_database["join_date"] = pd.to_datetime(df_database["join_date"], format='%m/%d/%Y', errors='coerce')
df_database["renewal_date"] = pd.to_datetime(df_database["renewal_date"], format='%m/%d/%Y', errors='coerce')
# df_database["sponsor_ID"] = pd.to_numeric(df_database["sponsor_ID"], errors='coerce', downcast='Int32')
df_database.head(10)

Unnamed: 0,user_ID,chapter_ID,industry,sponsor_ID,join_date,renewal_date
0,1034,23,"Medical, Chiropractor",,2004-07-01,2010-12-01
1,909,23,"Trades, Renovations-Remodeling",,2008-02-01,2008-12-01
2,2062,23,"Food/Beverages, Chef",,2010-09-01,2011-12-01
3,947,23,"Gifts, Gift Baskets",947.0,2007-03-01,2008-04-07
4,2311,23,"Mortgage, Mortgage Broker",,2007-07-01,2008-07-01
5,1536,23,"Real Estate Services, Residential Real Estate ...",,2007-07-01,2021-04-01
6,2486,23,"Insurance, Life,Health and Disability Insurance",,2007-07-01,2009-11-01
7,2322,23,"Financial, Investment Advisor",,2007-07-01,2009-04-01
8,878,23,"Health and Wellness, Massage Therapist",,2007-07-01,2008-07-01
9,753,23,"Insurance, General-Motor Insurance",947.0,2007-10-01,2008-10-01


### Dataset: dropped_members

In [7]:
df_dropped = pd.read_csv("data/dropped_members.csv", index_col=0, encoding="ISO-8859-1")
df_dropped.head(10)

Unnamed: 0,user_ID,chapter_ID,reason,drop_date
0,637.0,9,Left Company,11/03/2021
1,1185.0,23,Changed Job,10/22/2021
2,1225.0,14,Takes Too Much Time,10/13/2021
3,2138.0,15,Did Not Renew - No reason Given,11/01/2021
4,1610.0,15,Other Reason (see notes),10/12/2021
5,2574.0,12,Other Reason (see notes),10/28/2021
6,629.0,21,Changed Job,10/28/2021
7,722.0,13,Takes Too Much Time,10/22/2021
8,2580.0,23,Not Enough Referrals,09/23/2021
9,2336.0,6,Scheduling Conflicts,10/20/2021


In [8]:
df_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2223 entries, 0 to 2237
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   user_ID     2220 non-null   float64
 1   chapter_ID  2223 non-null   int64  
 2   reason      1355 non-null   object 
 3   drop_date   2223 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 86.8+ KB


Seems like there are some missing values in ```user_ID``` column, which is odd. There shouldn't be any missings values in this table.

In [9]:
df_dropped[df_dropped["user_ID"].isna()]

Unnamed: 0,user_ID,chapter_ID,reason,drop_date
884,,11,Personal Reasons,01/10/2018
1053,,31,Going back to School,03/28/2017
2147,,9,,01/06/2010


All those records are listed from a time period before the PALMS data that is being looked into, so those records can be dropped.

In [10]:
df_dropped.dropna(subset=["user_ID"], inplace=True)
df_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2220 entries, 0 to 2237
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   user_ID     2220 non-null   float64
 1   chapter_ID  2220 non-null   int64  
 2   reason      1353 non-null   object 
 3   drop_date   2220 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 86.7+ KB


In [11]:
df_dropped["drop_date"] = pd.to_datetime(df_dropped["drop_date"], format='%m/%d/%Y', errors='coerce')
df_dropped["user_ID"] = pd.to_numeric(df_dropped["user_ID"], downcast='integer')
df_dropped.head()

Unnamed: 0,user_ID,chapter_ID,reason,drop_date
0,637,9,Left Company,2021-11-03
1,1185,23,Changed Job,2021-10-22
2,1225,14,Takes Too Much Time,2021-10-13
3,2138,15,Did Not Renew - No reason Given,2021-11-01
4,1610,15,Other Reason (see notes),2021-10-12


## Join data - create a master dataframe

In [12]:
df_master = df_palms.copy()
df_master = df_master.merge(df_database.drop("chapter_ID", axis=1), how="left", on="user_ID")
df_master = df_master.merge(df_dropped, how="left", on="user_ID")
df_master.drop(["chapter_ID_y", "reason"], axis=1, inplace=True)
df_master.rename({"chapter_ID_x": "chapter_ID"}, axis=1, inplace=True)
df_master.sample(10, random_state=13)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,V,1-2-1,TYFCB,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date
10260,2231,11,5,0,0,0,0,0,0,2,...,0,5,0,0,2016-05-01,"Trades, Roofing",,2015-08-01,2020-03-01,2019-11-01
27211,259,32,4,0,0,0,0,2,2,0,...,0,5,203,0,2018-02-01,"Mortgage, Residential Mortgage",331.0,2010-11-01,2020-11-01,2020-10-31
11930,1453,19,4,0,0,0,0,0,3,0,...,0,4,8145,4,2016-07-01,"Training & Coaching, Business Training/Coach",,2018-05-01,2021-11-01,2017-07-18
28109,870,6,4,0,0,0,0,0,8,1,...,1,0,2078,1,2018-04-01,"Accounting, Tax Prepare",278.0,2016-02-01,2020-02-01,2019-06-11
1743,1245,10,3,0,2,0,0,2,4,0,...,0,3,2866,0,2015-04-01,"Health & Wellness, Chiropractor",,2020-01-01,2022-03-01,2021-07-21
12349,1987,9,4,0,0,0,0,0,4,0,...,0,2,2063,1,2016-08-01,"Telecommunications, Telecommunications Product...",766.0,2019-09-01,2022-03-01,2019-08-13
47959,1536,23,2,0,0,1,0,1,0,0,...,0,1,273,1,2020-03-01,"Real Estate Services, Residential Real Estate ...",,2007-07-01,2021-04-01,2020-12-18
20087,2574,12,3,1,0,0,0,6,4,1,...,0,2,2176,27,2017-06-01,"Construction, Commercial Builder",1365.0,2017-01-01,2022-06-01,2021-10-28
15449,2754,10,1,2,0,0,0,0,5,1,...,0,0,0,0,2016-12-01,"Insurance, General-Motor Insurance",1602.0,2015-08-01,2017-02-01,2017-01-01
55441,262,31,3,0,0,0,0,0,0,1,...,0,10,575,6,2020-12-01,"Advertising & Marketing, Web Design",521.0,2019-02-01,2022-02-01,NaT


In [13]:
df_master.shape

(63712, 21)

# Data cleaning & aggregation
## Remove duplicates
Check different variants, depending on columns selected - how many records are dropped in each variant.

In [14]:
df_master_clean = df_master.copy()
df_master_clean["palms_date"] = pd.to_datetime(df_master_clean["palms_date"], errors='coerce')
df_master_clean["renewal_date"] = pd.to_datetime(df_master_clean["renewal_date"], errors='coerce')

df_master_clean.shape[0]

63712

In [15]:
# variant 0
df_master_clean.drop_duplicates().shape[0]

62068

In [16]:
df_master_clean.columns

Index(['user_ID', 'chapter_ID', 'P', 'A', 'L', 'M', 'S', 'RGI', 'RGO', 'RRI',
       'RRO', 'V', '1-2-1', 'TYFCB', 'CEU', 'palms_date', 'industry',
       'sponsor_ID', 'join_date', 'renewal_date', 'drop_date'],
      dtype='object')

In [17]:
# variant 1
df_master_clean.drop_duplicates(subset=df_master_clean.columns[:16]).shape[0]

38712

In [18]:
# variant 2
df_master_clean.drop_duplicates(subset=["user_ID", "chapter_ID", "palms_date"]).shape[0]

38683

In [19]:
# variant 3
df_master_clean.drop_duplicates(subset=["user_ID", "palms_date"]).shape[0]

38644

Lets go with **variant 1** which is more precise than **variant 0**. It seems that there are some mix-ups in the latter columns: 'profession', 'join_date', 'renewal_date', 'drop_date'

In [20]:
df_master_clean.drop_duplicates(subset=df_master_clean.columns[:16], inplace=True)
df_master_clean.shape

(38712, 21)

## Get relative renewal date for data aggregation

In [21]:
df_master_clean["year_of_membership"] = (df_master_clean["palms_date"] - df_master_clean["join_date"]) / np.timedelta64(1, 'M')
df_master_clean["year_of_membership"] = df_master_clean["year_of_membership"].round().astype(int) // 12
df_master_clean["months_to_renewal"] = (df_master_clean["renewal_date"] - df_master_clean["palms_date"]) / np.timedelta64(1, 'M')
df_master_clean["months_to_renewal"] = df_master_clean["months_to_renewal"].round().astype(int)
df_master_clean["years_to_renewal"] = (df_master_clean["months_to_renewal"] - 1) // 12
df_master_clean["months_to_renewal"] = df_master_clean["months_to_renewal"] % 12

# Substitute "months_to_renewal" value 0 with 12 for ease of aggregation later on
df_master_clean.loc[df_master_clean["months_to_renewal"] == 0, "months_to_renewal"] = 12

for index, row in df_master_clean.iterrows():
    df_master_clean.at[index, 'relative_renewal_date'] = row['renewal_date'] - pd.DateOffset(years=row['years_to_renewal'])

df_master_clean.drop(["years_to_renewal"], axis=1, inplace=True)
df_master_clean.sample(10, random_state=13)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
62155,1551,8,4,1,0,0,0,0,1,2,...,16,2021-09-01,"Food & Beverage, Caterer",2478.0,2018-12-01,2021-12-01,2020-12-01,2,3,2021-12-01
50999,2109,14,3,0,0,0,0,0,0,0,...,3,2020-07-01,"Advertising & Marketing, Sign Company",2998.0,2020-05-01,2021-05-01,2021-03-29,0,10,2021-05-01
58833,1325,8,2,0,0,2,0,1,0,2,...,1,2021-05-01,"Construction, Garage Doors",,2020-12-01,2021-12-01,2021-05-19,0,7,2021-12-01
44957,2134,32,4,0,0,0,0,2,2,1,...,4,2019-11-01,"Information Technology, IT Professional Services",3015.0,2019-08-01,2020-08-01,2020-02-27,0,9,2020-08-01
30359,2492,25,1,1,0,0,0,0,0,0,...,0,2018-06-01,"Cleaning, Commercial Cleaning",1192.0,2018-04-01,2019-04-01,2018-06-10,0,10,2019-04-01
21857,2045,18,3,1,0,0,1,1,1,0,...,2,2017-08-01,"Trades, Steel Fabricator",1614.0,2016-05-01,2018-05-01,2017-09-30,1,9,2018-05-01
47331,2983,29,1,0,0,0,0,0,0,0,...,0,2020-02-01,"Trades, General Contractor Residential",2823.0,2020-03-01,2021-03-01,2020-07-15,-1,1,2020-03-01
42157,2114,18,4,0,0,0,0,1,0,0,...,7,2019-08-01,"Financial, Bank Services",172.0,2019-03-01,2020-03-01,2019-09-01,0,7,2020-03-01
44169,1665,32,5,0,0,0,0,0,4,0,...,4,2019-10-01,"Real Estate Services, Residential Real Estate ...",1809.0,2018-08-01,2022-08-01,NaT,1,10,2020-08-01
33609,2841,24,3,0,0,0,0,0,0,0,...,0,2018-10-01,"Trades, General Contractor",97.0,2018-09-01,2019-09-01,2019-09-17,0,11,2019-09-01


## Ensure data Integrity
 1. Check if ```year_of_membership``` is a negative number
 2. Check drop dates
 3. Remove any remaining duplicate entries

Let's tackle the first item:

### 1. Check if ```year_of_membership``` is a negative number
Lets see which chapters contain members with a negative ```year_of_membership```.

In [22]:
print(sorted(df_master_clean.loc[df_master_clean["year_of_membership"] < 0, "chapter_ID"].unique()))

[1, 2, 4, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 23, 24, 25, 26, 28, 29, 31, 32]


Seems as though the majority of the chapters have at least one records with a negative year of membership. In this case let's look into members who have any records with negative ```year_of_membership``` and try to fix them.

In [23]:
df_master_clean.loc[df_master_clean["year_of_membership"] < 0]

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
2,1060,1,1,0,0,0,0,0,0,0,...,0,2015-01-01,"Security, Security Services",782.0,2016-06-01,2017-09-01,2017-08-07,-2,8,2015-09-01
6,2347,1,3,1,0,0,0,0,1,0,...,0,2015-01-01,"Advertising & Marketing, Social Media",576.0,2019-06-01,2022-06-01,2016-08-09,-5,5,2015-06-01
11,2673,1,4,0,0,0,0,0,4,0,...,0,2015-01-01,"Real Estate, Real Estate Sales Representative ...",,2015-12-01,2018-05-01,2018-01-18,-1,4,2015-05-01
24,2838,1,4,0,0,0,0,0,10,0,...,0,2015-01-01,"Unknown, BNIW_Null",,2015-12-01,2016-12-01,2017-01-04,-1,11,2015-12-01
130,2092,10,4,0,0,0,0,1,0,0,...,0,2015-01-01,"Coach, Business Coach",3024.0,2019-07-01,2020-11-01,2020-07-02,-5,10,2015-11-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63202,788,14,1,0,0,0,0,0,1,0,...,0,2021-10-01,"Legal & Accounting, Bookkeeping",,2021-11-01,2022-11-01,NaT,-1,1,2021-11-01
63276,640,17,1,0,0,0,0,0,0,0,...,0,2021-10-01,"Real Estate Services, Property Management",,2021-11-01,2022-11-01,NaT,-1,1,2021-11-01
63289,304,17,1,0,0,0,0,0,0,0,...,0,2021-10-01,"Health & Wellness, Naturopaths",1932.0,2021-11-01,2022-11-01,NaT,-1,1,2021-11-01
63385,195,19,1,0,0,0,0,0,0,0,...,0,2021-10-01,"Construction, Carpenter",2883.0,2021-11-01,2022-11-01,NaT,-1,1,2021-11-01


In [24]:
# Get unique 'user_ID' of members who have at least one month of negative membership
df_master_clean.loc[df_master_clean["year_of_membership"] < 0, "user_ID"].unique()

array([1060, 2347, 2673, 2838, 2092, 2387, 2288,  215, 1641, 2176,  140,
       1795, 2854, 1248,  504, 1752,  943, 1407, 2356,  747,  786, 1257,
        507, 1372, 2046, 2378, 1210, 2754, 2529, 2256, 1703,  829, 2156,
       2567, 1087, 2091, 2150, 1291, 2148, 1751,  217, 2075, 1469, 2793,
        182, 1196, 1037, 1178, 1708, 1243, 2565, 2593, 1998, 2805,  800,
        231,  375,  580, 2879, 1903, 1129, 1767,  633, 3024, 1895, 1582,
        724, 1448, 1550, 1198, 2819, 2832, 1327,  773, 1207,  439, 1801,
       2011,  667, 2368,  336, 1331, 1959,  969, 1387,  781, 1130, 2204,
       1288, 2265, 1845, 2590,  570,  508, 2901,  300, 2056,  711, 1277,
        886, 2583, 2746, 2268, 1381,  482, 1450, 3052, 1193,  834,  117,
        862, 2814, 1724, 1226, 2045, 2049, 2782, 2697, 1021, 2856,  687,
       2467, 1137, 2816,  235, 1203, 2844,  471, 2312, 1916,  275, 2111,
       2930, 1088,  367, 2672, 1750, 1299,   10, 1737, 1490,  586,  694,
       2774, 2208,  818, 2638, 2120, 2622, 1332, 26

There seems to be many records that have a negative ```year_of_membership``` and it would be too tedious too look into every single one specifically. There might however be a pattern that the first one or two months of PALMS when the member joins is before the actual ```join_date```. Having this in mind, I recall that a member was allowed to start attending chapter meetings a little bit before he was entered into the system. Records with one or two months prior to the members' ```join_date``` aren't a problem as they can safely be dropped and excluded from the aggregation later on.

There is, however, a **problem if a member has more that two months of negative** ```year_of_membership``` because it might indicate a different situation than above. Let's create a new dataframe which counts how many PALMS records contain a negative ```year_of_membership``` per user.

In [25]:
df_negative = df_master_clean.copy()
df_negative["negative_months"] = 1
df_negative = df_negative.loc[df_negative["year_of_membership"] < 0].groupby("user_ID")[["negative_months"]].count()
df_negative

Unnamed: 0_level_0,negative_months
user_ID,Unnamed: 1_level_1
7,8
10,1
18,1
19,1
20,1
...,...
3044,1
3045,1
3052,1
3056,1


There are 460 users with at least one month with negative ```year_of_membership```.

### Members with two or less negative months
First lets look at a few random samples members who have 2 or less records with ```negative_months``` to ensure that they are indeed just a few months before the actual join date. If those those few members will indeed have just a month or two **before** their ```join_date``` then those months can be disregarded and dropped - this data should not be taken into account for the aggregation as it would produce incorrect results.

In [26]:
df_negative.loc[df_negative["negative_months"] <= 2].shape[0]

425

In [27]:
df_negative.loc[df_negative["negative_months"] <= 2].sample(10, random_state=13)

Unnamed: 0_level_0,negative_months
user_ID,Unnamed: 1_level_1
294,1
1452,1
246,1
1737,1
2590,1
10,1
535,1
1970,1
2930,1
2375,1


#### - User 294

In [28]:
df_master_clean.loc[df_master_clean["user_ID"] == 294].sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
28859,294,32,1,0,0,0,0,0,0,0,...,0,2018-04-01,"Construction, Electrician",2978.0,2018-05-01,2022-05-01,NaT,-1,1,2018-05-01
29695,294,32,5,0,0,0,0,2,1,3,...,3,2018-05-01,"Construction, Electrician",2978.0,2018-05-01,2022-05-01,NaT,0,12,2019-05-01
30527,294,32,4,0,0,0,0,1,3,1,...,0,2018-06-01,"Construction, Electrician",2978.0,2018-05-01,2022-05-01,NaT,0,11,2019-05-01
31315,294,32,5,0,0,0,0,4,1,1,...,1,2018-07-01,"Construction, Electrician",2978.0,2018-05-01,2022-05-01,NaT,0,10,2019-05-01
32126,294,32,4,0,0,0,0,2,1,0,...,0,2018-08-01,"Construction, Electrician",2978.0,2018-05-01,2022-05-01,NaT,0,9,2019-05-01


Indeed there is one month where the ```year_of_membership``` is negative for user 294 and it is one month prior to the ```join_date```. Having a look at the "PALMS" section the member attended 1 meeting before joining the group in 2018-05-01. Those records will be eventually dropped in the cleaning and aggregation process later on.
#### - User 1452

In [29]:
df_master_clean.loc[df_master_clean["user_ID"] == 1452].sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
54578,1452,28,1,0,0,0,0,0,0,0,...,4,2020-11-01,"Legal & Accounting, Certified Public Accountan...",698.0,2020-12-01,2022-06-01,2021-10-08,-1,7,2021-06-01
55377,1452,28,3,0,0,0,0,0,0,0,...,5,2020-12-01,"Legal & Accounting, Certified Public Accountan...",698.0,2020-12-01,2022-06-01,2021-10-08,0,6,2021-06-01
56202,1452,28,4,0,0,0,0,1,2,1,...,2,2021-01-01,"Legal & Accounting, Certified Public Accountan...",698.0,2020-12-01,2022-06-01,2021-10-08,0,5,2021-06-01
57020,1452,28,4,0,0,0,0,2,2,0,...,3,2021-02-01,"Legal & Accounting, Certified Public Accountan...",698.0,2020-12-01,2022-06-01,2021-10-08,0,4,2021-06-01
57845,1452,28,5,0,0,0,0,1,2,0,...,3,2021-03-01,"Legal & Accounting, Certified Public Accountan...",698.0,2020-12-01,2022-06-01,2021-10-08,0,3,2021-06-01


Same case as previous user.

#### - User 246 

In [30]:
df_master_clean.loc[df_master_clean["user_ID"] == 246].sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
17924,246,23,1,0,0,0,0,0,0,0,...,0,2017-03-01,"Telecommunications, Telecommunications Services",,2017-04-01,2021-04-01,2020-12-15,-1,1,2017-04-01
18662,246,23,4,0,0,0,0,2,0,0,...,3,2017-04-01,"Telecommunications, Telecommunications Services",,2017-04-01,2021-04-01,2020-12-15,0,12,2018-04-01
19466,246,23,4,0,0,0,0,3,3,0,...,4,2017-05-01,"Telecommunications, Telecommunications Services",,2017-04-01,2021-04-01,2020-12-15,0,11,2018-04-01
20301,246,23,5,0,0,0,0,2,0,0,...,5,2017-06-01,"Telecommunications, Telecommunications Services",,2017-04-01,2021-04-01,2020-12-15,0,10,2018-04-01
21143,246,23,3,1,0,0,0,0,3,0,...,1,2017-07-01,"Telecommunications, Telecommunications Services",,2017-04-01,2021-04-01,2020-12-15,0,9,2018-04-01


#### - User 1737

In [31]:
df_master_clean.loc[df_master_clean["user_ID"] == 1737].sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
13514,1737,25,1,0,1,0,0,0,0,1,...,0,2016-09-01,"Insurance, Property & Casualty Insurance",2448.0,2016-10-01,2018-10-01,2018-09-06,-1,1,2016-10-01
14282,1737,25,4,0,0,0,0,1,1,1,...,7,2016-10-01,"Insurance, Property & Casualty Insurance",2448.0,2016-10-01,2018-10-01,2018-09-06,0,12,2017-10-01
15023,1737,25,3,0,0,0,1,2,4,5,...,2,2016-11-01,"Insurance, Property & Casualty Insurance",2448.0,2016-10-01,2018-10-01,2018-09-06,0,11,2017-10-01
15786,1737,25,2,2,0,0,0,0,0,0,...,2,2016-12-01,"Insurance, Property & Casualty Insurance",2448.0,2016-10-01,2018-10-01,2018-09-06,0,10,2017-10-01
16532,1737,25,3,0,0,0,0,1,1,1,...,2,2017-01-01,"Insurance, Property & Casualty Insurance",2448.0,2016-10-01,2018-10-01,2018-09-06,0,9,2017-10-01


#### - User 2590

In [32]:
df_master_clean.loc[df_master_clean["user_ID"] == 2590].sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
8606,2590,2,3,0,0,0,0,0,0,0,...,0,2016-03-01,"Insurance, Business Insurance",969.0,2016-04-01,2018-12-01,NaT,-1,9,2016-12-01
9338,2590,2,3,1,0,0,0,0,1,0,...,0,2016-04-01,"Insurance, Business Insurance",969.0,2016-04-01,2018-12-01,NaT,0,8,2016-12-01
10056,2590,2,3,1,0,0,0,4,0,0,...,25,2016-05-01,"Insurance, Business Insurance",969.0,2016-04-01,2018-12-01,NaT,0,7,2016-12-01
10779,2590,2,4,1,0,0,0,0,0,1,...,0,2016-06-01,"Insurance, Business Insurance",969.0,2016-04-01,2018-12-01,NaT,0,6,2016-12-01
11516,2590,2,2,0,0,0,1,0,3,0,...,0,2016-07-01,"Insurance, Business Insurance",969.0,2016-04-01,2018-12-01,NaT,0,5,2016-12-01


#### - User 10

In [33]:
df_master_clean.loc[df_master_clean["user_ID"] == 10].sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
13400,10,19,1,0,0,0,0,0,0,0,...,0,2016-09-01,"Trades, Painter",1308.0,2016-10-01,2018-10-01,2018-09-18,-1,1,2016-10-01
14166,10,19,4,0,0,0,0,0,0,0,...,0,2016-10-01,"Trades, Painter",1308.0,2016-10-01,2018-10-01,2018-09-18,0,12,2017-10-01
14927,10,19,5,0,0,0,0,4,1,2,...,8,2016-11-01,"Trades, Painter",1308.0,2016-10-01,2018-10-01,2018-09-18,0,11,2017-10-01
15672,10,19,2,0,0,0,0,2,0,0,...,0,2016-12-01,"Trades, Painter",1308.0,2016-10-01,2018-10-01,2018-09-18,0,10,2017-10-01
16419,10,19,3,1,0,0,0,1,2,1,...,3,2017-01-01,"Trades, Painter",1308.0,2016-10-01,2018-10-01,2018-09-18,0,9,2017-10-01


#### - User 535

In [34]:
df_master_clean.loc[df_master_clean["user_ID"] == 535].sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
62639,535,21,2,0,0,0,0,2,3,2,...,1,2021-09-01,"Training & Coaching, Business Training/Coach",1873.0,2021-10-01,2022-11-01,NaT,-1,2,2021-11-01
63454,535,21,4,0,0,0,0,3,4,2,...,5,2021-10-01,"Training & Coaching, Business Training/Coach",1873.0,2021-10-01,2022-11-01,NaT,0,1,2021-11-01


This example shows insufficient amount of records to perform calculations but those will be dropped later in the aggregation cleaning process.
#### - User 1970

In [35]:
df_master_clean.loc[df_master_clean["user_ID"] == 1970].sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
24390,1970,19,2,0,0,0,0,0,2,0,...,0,2017-11-01,"Health & Wellness, Nutritionist",2958.0,2017-12-01,2022-12-01,NaT,-1,1,2017-12-01
25188,1970,19,3,0,0,0,0,2,0,0,...,6,2017-12-01,"Health & Wellness, Nutritionist",2958.0,2017-12-01,2022-12-01,NaT,0,12,2018-12-01
26026,1970,19,4,0,0,0,0,1,5,1,...,6,2018-01-01,"Health & Wellness, Nutritionist",2958.0,2017-12-01,2022-12-01,NaT,0,11,2018-12-01
26858,1970,19,4,0,0,0,0,1,7,2,...,6,2018-02-01,"Health & Wellness, Nutritionist",2958.0,2017-12-01,2022-12-01,NaT,0,10,2018-12-01
27671,1970,19,3,0,0,0,1,1,2,1,...,5,2018-03-01,"Health & Wellness, Nutritionist",2958.0,2017-12-01,2022-12-01,NaT,0,9,2018-12-01


#### - User 2930

In [36]:
df_master_clean.loc[df_master_clean["user_ID"] == 2930].sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
12253,2930,4,2,0,0,0,0,0,0,0,...,0,2016-08-01,"Health and Wellness, Spa",1787.0,2016-09-01,2018-09-01,NaT,-1,1,2016-09-01
12995,2930,4,4,0,0,0,0,0,2,0,...,2,2016-09-01,"Health and Wellness, Spa",1787.0,2016-09-01,2018-09-01,NaT,0,12,2017-09-01
13738,2930,4,4,0,0,0,0,0,0,0,...,3,2016-10-01,"Health and Wellness, Spa",1787.0,2016-09-01,2018-09-01,NaT,0,11,2017-09-01
14516,2930,4,4,1,0,0,0,1,5,0,...,5,2016-11-01,"Health and Wellness, Spa",1787.0,2016-09-01,2018-09-01,NaT,0,10,2017-09-01
15271,2930,4,4,0,0,0,0,0,0,0,...,2,2016-12-01,"Health and Wellness, Spa",1787.0,2016-09-01,2018-09-01,NaT,0,9,2017-09-01


#### - User 2375

In [37]:
df_master_clean.loc[df_master_clean["user_ID"] == 2375].sort_values("palms_date").head(5)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
28955,2375,9,1,0,0,0,0,0,0,0,...,0,2018-05-01,"Speaking, Speaker",1366.0,2018-06-01,2019-07-01,2019-05-09,-1,2,2018-07-01
29791,2375,9,4,0,0,0,0,0,0,0,...,0,2018-06-01,"Speaking, Speaker",1366.0,2018-06-01,2019-07-01,2019-05-09,0,1,2018-07-01
30619,2375,9,4,0,0,0,0,1,3,1,...,14,2018-07-01,"Speaking, Speaker",1366.0,2018-06-01,2019-07-01,2019-05-09,0,12,2019-07-01
31427,2375,9,5,0,0,0,0,0,1,1,...,2,2018-08-01,"Speaking, Speaker",1366.0,2018-06-01,2019-07-01,2019-05-09,0,11,2019-07-01
32241,2375,9,3,0,0,0,1,3,2,0,...,25,2018-09-01,"Speaking, Speaker",1366.0,2018-06-01,2019-07-01,2019-05-09,0,10,2019-07-01


**Conclusion**

Having a look at randomly sampled 10 members it is clear that there is a pattern for members with two or less records with negative ```year_of_membership```. The one month always occurs before the actual ```join_date``` which means that the member is allowed to start attending meetings before the join date. Those records with negative membership will be dropped later.

### Members with at least two negative months
The data for each member can be cross-checked and  the incorrect records cleaned based on the ```df_database``` dataset.

In [38]:
df_negative.loc[df_negative["negative_months"] > 2].sort_values("negative_months", ascending=False)

Unnamed: 0_level_0,negative_months
user_ID,Unnamed: 1_level_1
3024,33
2901,33
2092,32
305,29
1848,29
1016,27
39,25
2523,25
1715,24
1765,21


In [39]:
df_negative.loc[df_negative["negative_months"] > 2].sort_values("negative_months", ascending=False).shape

(35, 1)

### Cleaning single users:
Cleaning those records requires some expertise about the data - namely if a member dropped and rejoined sometime later or perhaps transferred chapters. The 35 records will be detached from the ```df_master_clean``` dataframe and later rejoined after the calculations are re-done.

My methodology per each user consists of the following steps:
1. Check users PALMS and ```df_database``` data and see if there are any discrepancies in: ```chapter_ID```, ```join_date``` or ```renewal_date```
2. Pull up ```df_database``` to see all user ```join_date```:
 - If there are multiple and the member transferred chapters - impute all ```join_date```s in the PALMS data with the oldest date from ```df_database```
 - If there are multiple and the member left BNI and rejoined sometime later - attempt to calculate the cumulative membership year and separate those PALMS records from the original for later merging.
3. Pull up ```df_database``` to see all user ```drop_date```:
 - If there are none - assume that the member transferred chapters and never left BNI
 - Check the most recent PALMS date which contains the members' data.
 
### Seperate records to be cleaned

In [40]:
index_list = df_negative.loc[df_negative["negative_months"] > 2].index.tolist()

df_workspace = df_master_clean.loc[df_master_clean["user_ID"].isin(index_list)].copy()
df_temp = df_master_clean.loc[~df_master_clean["user_ID"].isin(index_list)].copy()
df_workspace.head()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
6,2347,1,3,1,0,0,0,0,1,0,...,0,2015-01-01,"Advertising & Marketing, Social Media",576.0,2019-06-01,2022-06-01,2016-08-09,-5,5,2015-06-01
11,2673,1,4,0,0,0,0,0,4,0,...,0,2015-01-01,"Real Estate, Real Estate Sales Representative ...",,2015-12-01,2018-05-01,2018-01-18,-1,4,2015-05-01
24,2838,1,4,0,0,0,0,0,10,0,...,0,2015-01-01,"Unknown, BNIW_Null",,2015-12-01,2016-12-01,2017-01-04,-1,11,2015-12-01
130,2092,10,4,0,0,0,0,1,0,0,...,0,2015-01-01,"Coach, Business Coach",3024.0,2019-07-01,2020-11-01,2020-07-02,-5,10,2015-11-01
282,2387,17,4,0,0,0,0,1,3,0,...,0,2015-01-01,"Advertising & Marketing, Copywriter/Writer",,2016-08-01,2021-06-01,2021-06-01,-2,5,2015-06-01


#### - User 3024

In [41]:
df_workspace.loc[df_workspace["user_ID"] == 3024].sample(5, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
13300,3024,17,3,0,0,0,1,3,0,0,...,11,2016-09-01,"Finance & Insurance, Credit Card/Merchant Serv...",,2018-09-01,2022-03-01,2018-09-06,-2,6,2017-03-01
55603,3024,9,4,0,0,0,0,0,3,0,...,23,2021-01-01,"Finance & Insurance, Credit Card/Merchant Serv...",,2018-09-01,2022-03-01,2018-09-06,2,2,2021-03-01
19330,3024,17,5,0,0,0,0,0,3,1,...,10,2017-05-01,"Finance & Insurance, Credit Card/Merchant Serv...",,2018-09-01,2022-03-01,2018-09-06,-2,10,2018-03-01
41007,3024,9,4,0,0,0,0,0,0,0,...,6,2019-07-01,"Finance & Insurance, Credit Card/Merchant Serv...",,2018-09-01,2022-03-01,2018-09-06,0,8,2020-03-01
24338,3024,17,5,0,0,0,0,4,1,1,...,11,2017-11-01,"Finance & Insurance, Credit Card/Merchant Serv...",,2018-09-01,2022-03-01,2018-09-06,-1,4,2018-03-01


This user has a varying ```chapter_ID``` and ```drop_date```. Having a look at the ```df_database``` it seems as though the member transferred chapters from **17** to **9**.

In [42]:
df_workspace.loc[(df_workspace["user_ID"] == 3024) & (df_workspace["chapter_ID"] == 17)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
30892,3024,17,3,0,1,0,0,0,0,0,...,24,2018-07-01,"Finance & Insurance, Credit Card/Merchant Serv...",,2018-09-01,2022-03-01,2018-09-06,-1,8,2019-03-01
31707,3024,17,4,0,1,0,0,0,7,2,...,41,2018-08-01,"Finance & Insurance, Credit Card/Merchant Serv...",,2018-09-01,2022-03-01,2018-09-06,-1,7,2019-03-01
32535,3024,17,0,2,0,0,0,0,0,0,...,0,2018-09-01,"Finance & Insurance, Credit Card/Merchant Serv...",,2018-09-01,2022-03-01,2018-09-06,0,6,2019-03-01


In [43]:
df_workspace.loc[(df_workspace["user_ID"] == 3024) & (df_workspace["chapter_ID"] == 9)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
32285,3024,9,4,0,0,0,0,0,3,1,...,16,2018-09-01,"Finance & Insurance, Credit Card/Merchant Serv...",,2018-09-01,2022-03-01,2018-09-06,0,6,2019-03-01
33138,3024,9,4,0,0,0,0,1,2,0,...,18,2018-10-01,"Finance & Insurance, Credit Card/Merchant Serv...",,2018-09-01,2022-03-01,2018-09-06,0,5,2019-03-01
33973,3024,9,5,0,0,0,0,1,2,1,...,17,2018-11-01,"Finance & Insurance, Credit Card/Merchant Serv...",,2018-09-01,2022-03-01,2018-09-06,0,4,2019-03-01


Looks like in 2018-09-01 the user transfered chapters and started being active in chapter 9. Lets drop users last month in chapter 17. 

In [44]:
df_workspace.drop(32535, axis=0, inplace=True)

In [45]:
df_database.loc[df_database["user_ID"] == 3024, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
438,2018-09-01
2077,2015-12-01


His ```join_date``` can be corrected to 2015-12-01 according to the ```df_database```.

In [46]:
df_workspace.loc[df_workspace["user_ID"] == 3024, "join_date"] = pd.Timestamp("2015-12-01")

In [47]:
df_dropped.loc[df_dropped["user_ID"] == 3024, ["drop_date"]].drop_duplicates()

Unnamed: 0,drop_date
728,2018-09-06


In [48]:
df_workspace.loc[df_workspace["user_ID"] == 3024, "palms_date"].max()

Timestamp('2021-10-01 00:00:00')

In [49]:
df_workspace.loc[(df_workspace["user_ID"] == 3024) & (df_workspace["chapter_ID"] == 9)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
61382,3024,9,4,0,0,0,0,1,3,1,...,12,2021-08-01,"Finance & Insurance, Credit Card/Merchant Serv...",,2015-12-01,2022-03-01,2018-09-06,2,7,2022-03-01
62216,3024,9,4,0,0,0,0,0,4,0,...,13,2021-09-01,"Finance & Insurance, Credit Card/Merchant Serv...",,2015-12-01,2022-03-01,2018-09-06,3,6,2022-03-01
63022,3024,9,3,0,0,0,0,1,2,0,...,10,2021-10-01,"Finance & Insurance, Credit Card/Merchant Serv...",,2015-12-01,2022-03-01,2018-09-06,3,5,2022-03-01


In [50]:
df_workspace.loc[(df_workspace["user_ID"] == 3024) & (df_workspace["chapter_ID"] == 9), "drop_date"] = pd.NaT

#### - User 2901

In [51]:
df_workspace.loc[df_workspace["user_ID"] == 2901].sample(5, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
21506,2901,4,3,1,0,0,1,1,3,0,...,7,2017-08-01,"Finance & Insurance, Financial Investments",1829.0,2018-12-01,2022-03-01,NaT,-2,7,2018-03-01
63166,2901,12,4,0,0,0,0,0,4,0,...,7,2021-10-01,"Finance & Insurance, Financial Investments",1829.0,2018-12-01,2022-03-01,NaT,2,5,2022-03-01
32151,2901,4,3,0,0,0,1,1,2,0,...,9,2018-09-01,"Finance & Insurance, Financial Investments",1829.0,2018-12-01,2022-03-01,NaT,-1,6,2019-03-01
14520,2901,4,5,0,0,0,0,0,4,0,...,18,2016-11-01,"Finance & Insurance, Financial Investments",1829.0,2018-12-01,2022-03-01,NaT,-3,4,2017-03-01
34961,2901,12,2,0,0,0,0,0,2,0,...,3,2018-12-01,"Finance & Insurance, Financial Investments",1829.0,2018-12-01,2022-03-01,NaT,0,3,2019-03-01


Similar case to user 3024 - group transfer from **4** to **12**.

In [52]:
df_workspace.loc[(df_workspace["user_ID"] == 2901) & (df_workspace["chapter_ID"] == 4)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
33003,2901,4,4,0,0,0,0,0,4,0,...,11,2018-10-01,"Finance & Insurance, Financial Investments",1829.0,2018-12-01,2022-03-01,NaT,-1,5,2019-03-01
33853,2901,4,4,0,0,0,0,0,2,0,...,5,2018-11-01,"Finance & Insurance, Financial Investments",1829.0,2018-12-01,2022-03-01,NaT,-1,4,2019-03-01
34675,2901,4,2,0,0,0,0,0,3,0,...,5,2018-12-01,"Finance & Insurance, Financial Investments",1829.0,2018-12-01,2022-03-01,NaT,0,3,2019-03-01


In [53]:
df_workspace.loc[(df_workspace["user_ID"] == 2901) & (df_workspace["chapter_ID"] == 12)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
34961,2901,12,2,0,0,0,0,0,2,0,...,3,2018-12-01,"Finance & Insurance, Financial Investments",1829.0,2018-12-01,2022-03-01,NaT,0,3,2019-03-01
35817,2901,12,4,0,0,0,0,1,8,3,...,12,2019-01-01,"Finance & Insurance, Financial Investments",1829.0,2018-12-01,2022-03-01,NaT,0,2,2019-03-01
36705,2901,12,4,0,0,0,0,1,8,5,...,9,2019-02-01,"Finance & Insurance, Financial Investments",1829.0,2018-12-01,2022-03-01,NaT,0,1,2019-03-01


In [54]:
df_workspace.loc[[34961], df_workspace.columns[2:15]] = df_workspace.loc[34961, df_workspace.columns[2:15]].values +\
    df_workspace.loc[34675, df_workspace.columns[2:15]].values
df_workspace.drop(34675, axis=0, inplace=True)

In [55]:
df_database.loc[df_database["user_ID"] == 2901, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
739,2018-12-01
2867,2016-03-01


In [56]:
df_workspace.loc[df_workspace["user_ID"] == 2901, "join_date"] = pd.Timestamp("2016-03-01")

In [57]:
df_dropped.loc[df_dropped["user_ID"] == 2901, ["drop_date"]].drop_duplicates()

Unnamed: 0,drop_date


In [58]:
df_workspace.loc[df_workspace["user_ID"] == 2901, "palms_date"].max()

Timestamp('2021-10-01 00:00:00')

In [59]:
df_workspace.loc[df_workspace["user_ID"] == 2901, "drop_date"] = pd.NaT

#### - User 2092 

In [60]:
df_workspace.loc[df_workspace["user_ID"] == 2092].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
8029,2092,10,3,0,0,0,1,0,1,0,...,0,2016-02-01,"Coach, Business Coach",3024.0,2019-07-01,2020-11-01,2020-07-02,-4,9,2016-11-01
10919,2092,10,5,0,0,0,0,0,3,1,...,0,2016-06-01,"Coach, Business Coach",3024.0,2019-07-01,2020-11-01,2020-07-02,-4,5,2016-11-01
45875,2092,9,4,0,0,0,0,0,1,1,...,0,2020-01-01,"Coach, Business Coach",3024.0,2019-07-01,2020-11-01,2020-07-02,0,10,2020-11-01
19973,2092,10,4,0,0,0,1,0,28,0,...,0,2017-06-01,"Coach, Business Coach",3024.0,2019-07-01,2020-11-01,2020-07-02,-3,5,2017-11-01
19134,2092,10,4,0,0,0,0,0,1,0,...,1,2017-05-01,"Coach, Business Coach",3024.0,2019-07-01,2020-11-01,2020-07-02,-3,6,2017-11-01
20815,2092,10,4,0,0,0,0,0,2,0,...,1,2017-07-01,"Coach, Business Coach",3024.0,2019-07-01,2020-11-01,2020-07-02,-2,4,2017-11-01
15427,2092,10,1,1,1,0,0,0,3,0,...,3,2016-12-01,"Coach, Business Coach",3024.0,2019-07-01,2020-11-01,2020-07-02,-3,11,2017-11-01
4822,2092,10,3,0,1,0,0,2,1,1,...,5,2015-09-01,"Coach, Business Coach",3024.0,2019-07-01,2020-11-01,2020-07-02,-4,2,2015-11-01


In [61]:
df_workspace.loc[(df_workspace["user_ID"] == 2092) & (df_workspace["chapter_ID"] == 10)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
19973,2092,10,4,0,0,0,1,0,28,0,...,0,2017-06-01,"Coach, Business Coach",3024.0,2019-07-01,2020-11-01,2020-07-02,-3,5,2017-11-01
20815,2092,10,4,0,0,0,0,0,2,0,...,1,2017-07-01,"Coach, Business Coach",3024.0,2019-07-01,2020-11-01,2020-07-02,-2,4,2017-11-01
21639,2092,10,1,1,0,0,0,0,1,4,...,0,2017-08-01,"Coach, Business Coach",3024.0,2019-07-01,2020-11-01,2020-07-02,-2,3,2017-11-01


In [62]:
df_workspace.loc[(df_workspace["user_ID"] == 2092) & (df_workspace["chapter_ID"] == 9)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
41852,2092,9,3,0,0,0,0,0,13,0,...,0,2019-08-01,"Coach, Business Coach",3024.0,2019-07-01,2020-11-01,2020-07-02,0,3,2019-11-01
42684,2092,9,4,0,0,0,0,0,10,3,...,0,2019-09-01,"Coach, Business Coach",3024.0,2019-07-01,2020-11-01,2020-07-02,0,2,2019-11-01
43515,2092,9,5,0,0,0,0,1,5,0,...,0,2019-10-01,"Coach, Business Coach",3024.0,2019-07-01,2020-11-01,2020-07-02,0,1,2019-11-01


This is a different case then previously as the user left BNI in 2017-08-01 and re-joined it a few years later in 2019-08-01. In such case there is no need to add rows, but perhaps the ```join_date``` and ```drop_date``` needs to be adjusted.

Additionally a dictionary containing information about how many months should be subtracted from a specific member and after what date should be created to account for the years during which the member was inactive.

In [146]:
df_adjust_records = pd.DataFrame(columns=["membership_months_to_sub",
                                          "membership_months_sub_date_start",
                                          "renewal_months_to_add",
                                          "renewal_months_add_palms_date_end"])
df_adjust_records.loc[2092] = [-24, "2019-08-01", 8, "2017-08-01"]
df_adjust_records

Unnamed: 0,membership_months_to_sub,membership_months_sub_date_start,renewal_months_to_add,renewal_months_add_palms_date_end
2092,-24,2019-08-01,8,2017-08-01


In [64]:
(pd.Timestamp("2020-07-01") - pd.Timestamp("2019-11-01")) / np.timedelta64(1, 'M')

7.983736832378488

In [65]:
df_database.loc[df_database["user_ID"] == 2092, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
447,2019-07-01
1487,2011-02-01


In [66]:
df_workspace.loc[df_workspace["user_ID"] == 2092, "join_date"] = pd.Timestamp("2011-02-01")

In [67]:
df_dropped.loc[df_dropped["user_ID"] == 2092, ["drop_date"]].drop_duplicates()

Unnamed: 0,drop_date
238,2020-07-02
969,2017-08-01


In [68]:
df_workspace.loc[df_workspace["user_ID"] == 2092, "palms_date"].max()

Timestamp('2020-09-01 00:00:00')

#### - User 305 

In [69]:
df_workspace.loc[df_workspace["user_ID"] ==  305].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
52033,305,21,4,0,0,0,0,1,0,1,...,3,2020-08-01,"Real Estate Services, Cleaning Service",,2021-09-01,2022-09-01,2021-07-07,-2,1,2020-09-01
55167,305,21,3,0,0,0,0,4,16,0,...,4,2020-12-01,"Real Estate Services, Cleaning Service",,2021-09-01,2022-09-01,2021-07-07,-1,9,2021-09-01
47070,305,21,3,1,0,0,0,0,4,1,...,0,2020-02-01,"Real Estate Services, Cleaning Service",,2021-09-01,2022-09-01,2021-07-07,-2,7,2020-09-01
51231,305,21,3,1,0,0,0,0,1,1,...,0,2020-07-01,"Real Estate Services, Cleaning Service",,2021-09-01,2022-09-01,2021-07-07,-2,2,2020-09-01
60131,305,21,3,0,0,1,0,0,0,1,...,0,2021-06-01,"Real Estate Services, Cleaning Service",,2021-09-01,2022-09-01,2021-07-07,-1,3,2021-09-01
42264,305,21,4,0,0,0,0,1,0,1,...,0,2019-08-01,"Real Estate Services, Cleaning Service",,2021-09-01,2022-09-01,2021-07-07,-3,1,2019-09-01
59314,305,21,2,2,0,0,0,3,2,3,...,8,2021-05-01,"Real Estate Services, Cleaning Service",,2021-09-01,2022-09-01,2021-07-07,-1,4,2021-09-01
39645,305,21,5,0,0,0,0,3,6,3,...,17,2019-05-01,"Real Estate Services, Cleaning Service",,2021-09-01,2022-09-01,2021-07-07,-3,4,2019-09-01


No indication of chapter change. Must be something different.

In [70]:
df_database.loc[df_database["user_ID"] == 305, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
2470,2021-09-01
3103,2019-04-01


In [71]:
df_workspace.loc[df_workspace["user_ID"] == 305, "palms_date"]

38757   2019-04-01
39645   2019-05-01
40521   2019-06-01
41418   2019-07-01
42264   2019-08-01
43087   2019-09-01
43910   2019-10-01
44690   2019-11-01
45466   2019-12-01
46254   2020-01-01
47070   2020-02-01
47949   2020-03-01
48792   2020-04-01
49625   2020-05-01
50426   2020-06-01
51231   2020-07-01
52033   2020-08-01
52827   2020-09-01
53620   2020-10-01
54411   2020-11-01
55167   2020-12-01
56013   2021-01-01
56842   2021-02-01
57672   2021-03-01
58494   2021-04-01
59314   2021-05-01
60131   2021-06-01
60974   2021-07-01
61703   2021-08-01
62530   2021-09-01
63332   2021-10-01
Name: palms_date, dtype: datetime64[ns]

Seems like the user was dropped and shortly later re-instated. Guessing it was some human error and will adjust ```join_date```.

In [72]:
df_workspace.loc[df_workspace["user_ID"] == 305, "join_date"] = pd.Timestamp("2019-04-01")

In [73]:
df_dropped.loc[df_dropped["user_ID"] == 305, ["drop_date"]].drop_duplicates()

Unnamed: 0,drop_date
73,2021-07-07


#### - User 1848 

In [74]:
df_workspace.loc[df_workspace["user_ID"] == 1848].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
47758,1848,15,4,0,0,0,0,1,8,1,...,2,2020-03-01,"Health & Wellness, Chiropractor",2180.0,2020-10-01,2022-06-01,2020-09-01,-1,3,2020-06-01
58935,1848,10,4,0,0,0,0,1,4,2,...,4,2021-05-01,"Health & Wellness, Chiropractor",2180.0,2020-10-01,2022-06-01,2020-09-01,0,1,2021-06-01
58121,1848,10,5,0,0,0,0,1,2,5,...,5,2021-04-01,"Health & Wellness, Chiropractor",2180.0,2020-10-01,2022-06-01,2020-09-01,0,2,2021-06-01
37711,1848,15,4,0,0,0,0,0,3,6,...,6,2019-03-01,"Health & Wellness, Chiropractor",2180.0,2020-10-01,2022-06-01,2020-09-01,-2,3,2019-06-01
40354,1848,15,4,0,0,0,0,0,2,2,...,3,2019-06-01,"Health & Wellness, Chiropractor",2180.0,2020-10-01,2022-06-01,2020-09-01,-2,12,2020-06-01
35916,1848,15,4,0,0,0,0,1,2,3,...,6,2019-01-01,"Health & Wellness, Chiropractor",2180.0,2020-10-01,2022-06-01,2020-09-01,-2,5,2019-06-01
54081,1848,10,4,0,0,0,0,3,1,5,...,1,2020-11-01,"Health & Wellness, Chiropractor",2180.0,2020-10-01,2022-06-01,2020-09-01,0,7,2021-06-01
41247,1848,15,5,0,0,0,0,2,1,2,...,6,2019-07-01,"Health & Wellness, Chiropractor",2180.0,2020-10-01,2022-06-01,2020-09-01,-2,11,2020-06-01


In [75]:
df_workspace.loc[(df_workspace["user_ID"] == 1848) & (df_workspace["chapter_ID"] == 15)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
51046,1848,15,4,0,0,0,0,2,1,1,...,2,2020-07-01,"Health & Wellness, Chiropractor",2180.0,2020-10-01,2022-06-01,2020-09-01,-1,11,2021-06-01
51852,1848,15,4,0,0,0,0,0,0,0,...,0,2020-08-01,"Health & Wellness, Chiropractor",2180.0,2020-10-01,2022-06-01,2020-09-01,-1,10,2021-06-01
52651,1848,15,4,0,0,0,0,0,0,0,...,0,2020-09-01,"Health & Wellness, Chiropractor",2180.0,2020-10-01,2022-06-01,2020-09-01,-1,9,2021-06-01


In [76]:
df_workspace.loc[(df_workspace["user_ID"] == 1848) & (df_workspace["chapter_ID"] == 10)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
53294,1848,10,5,0,0,0,0,3,1,4,...,1,2020-10-01,"Health & Wellness, Chiropractor",2180.0,2020-10-01,2022-06-01,2020-09-01,0,8,2021-06-01
54081,1848,10,4,0,0,0,0,3,1,5,...,1,2020-11-01,"Health & Wellness, Chiropractor",2180.0,2020-10-01,2022-06-01,2020-09-01,0,7,2021-06-01
54846,1848,10,5,0,0,0,0,0,3,3,...,1,2020-12-01,"Health & Wellness, Chiropractor",2180.0,2020-10-01,2022-06-01,2020-09-01,0,6,2021-06-01


In [77]:
df_database.loc[df_database["user_ID"] == 1848, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
1635,2020-10-01
2789,2018-05-01


In [78]:
df_workspace.loc[df_workspace["user_ID"] == 1848, "join_date"] = pd.Timestamp("2018-05-01")

In [79]:
df_dropped.loc[df_dropped["user_ID"] == 1848, ["drop_date"]].drop_duplicates()

Unnamed: 0,drop_date
234,2020-09-01


In [80]:
df_workspace.loc[df_workspace["user_ID"] == 1848, "palms_date"].max()

Timestamp('2021-10-01 00:00:00')

In [81]:
df_workspace.loc[df_workspace["user_ID"] == 1848, "drop_date"] = pd.NaT

#### - User 1016 

In [82]:
df_workspace.loc[df_workspace["user_ID"] == 1016].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
36955,1016,21,4,0,0,0,0,1,2,1,...,3,2019-02-01,"Finance & Insurance, Life and Disability Insur...",,2019-11-01,2022-02-01,2019-11-01,-1,12,2020-02-01
33538,1016,21,2,1,0,0,1,1,2,1,...,1,2018-10-01,"Finance & Insurance, Life and Disability Insur...",,2019-11-01,2022-02-01,2019-11-01,-2,4,2019-02-01
32691,1016,21,4,0,0,0,0,0,2,0,...,2,2018-09-01,"Finance & Insurance, Life and Disability Insur...",,2019-11-01,2022-02-01,2019-11-01,-2,5,2019-02-01
56873,1016,23,4,0,0,0,0,1,3,1,...,8,2021-02-01,"Finance & Insurance, Life and Disability Insur...",,2019-11-01,2022-02-01,2019-11-01,1,12,2022-02-01
46293,1016,23,4,0,0,0,0,1,3,0,...,3,2020-01-01,"Finance & Insurance, Life and Disability Insur...",,2019-11-01,2022-02-01,2019-11-01,0,1,2020-02-01
59344,1016,23,4,0,0,0,0,1,2,0,...,6,2021-05-01,"Finance & Insurance, Life and Disability Insur...",,2019-11-01,2022-02-01,2019-11-01,1,9,2022-02-01
49660,1016,23,4,0,0,0,0,0,1,0,...,1,2020-05-01,"Finance & Insurance, Life and Disability Insur...",,2019-11-01,2022-02-01,2019-11-01,0,9,2021-02-01
28545,1016,21,3,0,1,0,0,0,0,3,...,2,2018-04-01,"Finance & Insurance, Life and Disability Insur...",,2019-11-01,2022-02-01,2019-11-01,-2,10,2019-02-01


In [83]:
df_workspace.loc[(df_workspace["user_ID"] == 1016) & (df_workspace["chapter_ID"] == 21)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
43070,1016,21,0,0,2,0,2,1,1,0,...,2,2019-09-01,"Finance & Insurance, Life and Disability Insur...",,2019-11-01,2022-02-01,2019-11-01,-1,5,2020-02-01
43893,1016,21,4,0,0,0,1,1,2,0,...,4,2019-10-01,"Finance & Insurance, Life and Disability Insur...",,2019-11-01,2022-02-01,2019-11-01,-1,4,2020-02-01
44673,1016,21,0,1,0,0,0,0,1,1,...,0,2019-11-01,"Finance & Insurance, Life and Disability Insur...",,2019-11-01,2022-02-01,2019-11-01,0,3,2020-02-01


In [84]:
df_workspace.loc[(df_workspace["user_ID"] == 1016) & (df_workspace["chapter_ID"] == 23)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
44726,1016,23,3,0,0,0,0,1,0,0,...,3,2019-11-01,"Finance & Insurance, Life and Disability Insur...",,2019-11-01,2022-02-01,2019-11-01,0,3,2020-02-01
45503,1016,23,3,0,0,0,0,0,1,0,...,5,2019-12-01,"Finance & Insurance, Life and Disability Insur...",,2019-11-01,2022-02-01,2019-11-01,0,2,2020-02-01
46293,1016,23,4,0,0,0,0,1,3,0,...,3,2020-01-01,"Finance & Insurance, Life and Disability Insur...",,2019-11-01,2022-02-01,2019-11-01,0,1,2020-02-01


In [85]:
df_workspace.loc[[44726], df_workspace.columns[2:15]] = df_workspace.loc[44726, df_workspace.columns[2:15]].values +\
    df_workspace.loc[44673, df_workspace.columns[2:15]].values
df_workspace.drop(44673, axis=0, inplace=True)

In [86]:
df_database.loc[df_database["user_ID"] == 1016, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
196,2019-11-01
3045,2017-08-01


In [87]:
df_workspace.loc[df_workspace["user_ID"] == 1016, "join_date"] = pd.Timestamp("2017-08-01")

In [88]:
df_dropped.loc[df_dropped["user_ID"] == 1016, ["drop_date"]].drop_duplicates()

Unnamed: 0,drop_date
419,2019-11-01


In [89]:
df_workspace.loc[df_workspace["user_ID"] == 1016, "palms_date"].max()

Timestamp('2021-10-01 00:00:00')

In [90]:
df_workspace.loc[df_workspace["user_ID"] == 1016, "drop_date"] = pd.NaT

#### - User 39 

In [91]:
df_workspace.loc[df_workspace["user_ID"] == 39].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
44767,39,24,3,1,0,0,0,0,0,8,...,2,2019-11-01,"Food & Beverage, Caterer",,2020-06-01,2021-06-01,2021-04-21,-1,7,2020-06-01
52693,39,18,2,0,0,3,0,1,1,0,...,1,2020-09-01,"Food & Beverage, Caterer",,2020-06-01,2021-06-01,2021-04-21,0,9,2021-06-01
49686,39,24,4,0,0,0,0,2,1,8,...,0,2020-05-01,"Food & Beverage, Caterer",,2020-06-01,2021-06-01,2021-04-21,-1,1,2020-06-01
46332,39,24,3,1,0,0,0,1,1,5,...,3,2020-01-01,"Food & Beverage, Caterer",,2020-06-01,2021-06-01,2021-04-21,-1,5,2020-06-01
48010,39,24,2,1,0,0,0,0,0,3,...,0,2020-03-01,"Food & Beverage, Caterer",,2020-06-01,2021-06-01,2021-04-21,-1,3,2020-06-01
53487,39,18,3,1,0,0,0,2,1,1,...,1,2020-10-01,"Food & Beverage, Caterer",,2020-06-01,2021-06-01,2021-04-21,0,8,2021-06-01
51896,39,18,2,2,0,0,0,0,1,4,...,1,2020-08-01,"Food & Beverage, Caterer",,2020-06-01,2021-06-01,2021-04-21,0,10,2021-06-01
36163,39,24,4,0,0,0,0,1,2,1,...,3,2019-01-01,"Food & Beverage, Caterer",,2020-06-01,2021-06-01,2021-04-21,-2,5,2019-06-01


In [92]:
df_workspace.loc[(df_workspace["user_ID"] == 39) & (df_workspace["chapter_ID"] == 24)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
48852,39,24,3,1,1,0,0,0,0,6,...,0,2020-04-01,"Food & Beverage, Caterer",,2020-06-01,2021-06-01,2021-04-21,-1,2,2020-06-01
49686,39,24,4,0,0,0,0,2,1,8,...,0,2020-05-01,"Food & Beverage, Caterer",,2020-06-01,2021-06-01,2021-04-21,-1,1,2020-06-01
50487,39,24,1,1,0,0,0,0,0,0,...,0,2020-06-01,"Food & Beverage, Caterer",,2020-06-01,2021-06-01,2021-04-21,0,12,2021-06-01


In [93]:
df_workspace.loc[(df_workspace["user_ID"] == 39) & (df_workspace["chapter_ID"] == 18)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
50289,39,18,2,0,0,1,0,1,0,1,...,0,2020-06-01,"Food & Beverage, Caterer",,2020-06-01,2021-06-01,2021-04-21,0,12,2021-06-01
51095,39,18,3,1,0,0,0,0,2,5,...,0,2020-07-01,"Food & Beverage, Caterer",,2020-06-01,2021-06-01,2021-04-21,0,11,2021-06-01
51896,39,18,2,2,0,0,0,0,1,4,...,1,2020-08-01,"Food & Beverage, Caterer",,2020-06-01,2021-06-01,2021-04-21,0,10,2021-06-01


In [94]:
df_workspace.loc[[50289], df_workspace.columns[2:15]] = df_workspace.loc[50289, df_workspace.columns[2:15]].values +\
    df_workspace.loc[50487, df_workspace.columns[2:15]].values
df_workspace.drop(50487, axis=0, inplace=True)

In [95]:
df_database.loc[df_database["user_ID"] == 39, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
2462,2020-06-01
3003,2018-06-01


In [96]:
df_workspace.loc[df_workspace["user_ID"] == 39, "join_date"] = pd.Timestamp("2018-06-01")

In [97]:
df_dropped.loc[df_dropped["user_ID"] == 39, ["drop_date"]].drop_duplicates()

Unnamed: 0,drop_date
103,2021-04-21
294,2020-06-09


In [98]:
df_workspace.loc[df_workspace["user_ID"] == 39, "palms_date"].max()

Timestamp('2021-05-01 00:00:00')

#### - User 2523 

In [99]:
df_workspace.loc[df_workspace["user_ID"] == 2523].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
36967,2523,21,2,2,0,0,0,2,1,0,...,2,2019-02-01,"Architecture & Engineering, Interior Architecture",1016.0,2020-02-01,2022-02-01,2020-01-01,-1,12,2020-02-01
40516,2523,21,3,1,0,0,0,3,1,0,...,6,2019-06-01,"Architecture & Engineering, Interior Architecture",1016.0,2020-02-01,2022-02-01,2020-01-01,-1,8,2020-02-01
56878,2523,23,4,0,0,0,0,0,2,1,...,9,2021-02-01,"Architecture & Engineering, Interior Architecture",1016.0,2020-02-01,2022-02-01,2020-01-01,1,12,2022-02-01
50469,2523,23,3,1,0,0,0,0,0,0,...,4,2020-06-01,"Architecture & Engineering, Interior Architecture",1016.0,2020-02-01,2022-02-01,2020-01-01,0,8,2021-02-01
49668,2523,23,3,0,0,1,0,0,0,1,...,4,2020-05-01,"Architecture & Engineering, Interior Architecture",1016.0,2020-02-01,2022-02-01,2020-01-01,0,9,2021-02-01
51271,2523,23,4,1,0,0,0,0,1,0,...,5,2020-07-01,"Architecture & Engineering, Interior Architecture",1016.0,2020-02-01,2022-02-01,2020-01-01,0,7,2021-02-01
45462,2523,21,3,0,0,0,0,3,1,0,...,6,2019-12-01,"Architecture & Engineering, Interior Architecture",1016.0,2020-02-01,2022-02-01,2020-01-01,-1,2,2020-02-01
32710,2523,21,3,1,0,0,0,1,2,0,...,4,2018-09-01,"Architecture & Engineering, Interior Architecture",1016.0,2020-02-01,2022-02-01,2020-01-01,-2,5,2019-02-01


In [100]:
df_workspace.loc[(df_workspace["user_ID"] == 2523) & (df_workspace["chapter_ID"] == 21)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
44686,2523,21,2,1,0,0,1,1,1,0,...,2,2019-11-01,"Architecture & Engineering, Interior Architecture",1016.0,2020-02-01,2022-02-01,2020-01-01,-1,3,2020-02-01
45462,2523,21,3,0,0,0,0,3,1,0,...,6,2019-12-01,"Architecture & Engineering, Interior Architecture",1016.0,2020-02-01,2022-02-01,2020-01-01,-1,2,2020-02-01
46250,2523,21,2,0,0,0,0,0,0,0,...,0,2020-01-01,"Architecture & Engineering, Interior Architecture",1016.0,2020-02-01,2022-02-01,2020-01-01,-1,1,2020-02-01


In [101]:
df_workspace.loc[(df_workspace["user_ID"] == 2523) & (df_workspace["chapter_ID"] == 23)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
47128,2523,23,3,0,0,0,0,0,0,0,...,0,2020-02-01,"Architecture & Engineering, Interior Architecture",1016.0,2020-02-01,2022-02-01,2020-01-01,0,12,2021-02-01
47991,2523,23,3,0,0,0,0,1,1,1,...,5,2020-03-01,"Architecture & Engineering, Interior Architecture",1016.0,2020-02-01,2022-02-01,2020-01-01,0,11,2021-02-01
48834,2523,23,4,1,0,0,0,0,1,1,...,6,2020-04-01,"Architecture & Engineering, Interior Architecture",1016.0,2020-02-01,2022-02-01,2020-01-01,0,10,2021-02-01


In [102]:
df_database.loc[df_database["user_ID"] == 2523, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
202,2020-02-01
3078,2018-01-01


In [103]:
df_workspace.loc[df_workspace["user_ID"] == 2523, "join_date"] = pd.Timestamp("2018-01-01")

In [104]:
df_dropped.loc[df_dropped["user_ID"] == 2523, ["drop_date"]].drop_duplicates()

Unnamed: 0,drop_date
363,2020-01-01


In [105]:
df_workspace.loc[df_workspace["user_ID"] == 2523, "palms_date"].max()

Timestamp('2021-10-01 00:00:00')

In [106]:
df_workspace.loc[df_workspace["user_ID"] == 2523, "drop_date"] = pd.NaT

#### - User 1715 

In [107]:
df_workspace.loc[df_workspace["user_ID"] == 1715].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
57620,1715,19,4,0,1,0,0,1,5,6,...,6,2021-03-01,"Construction, Plumbing",,2021-10-01,2022-10-01,2021-09-23,-1,7,2021-10-01
56793,1715,19,4,0,0,0,0,4,6,3,...,4,2021-02-01,"Construction, Plumbing",,2021-10-01,2022-10-01,2021-09-23,-1,8,2021-10-01
55963,1715,19,3,0,0,0,1,2,9,2,...,5,2021-01-01,"Construction, Plumbing",,2021-10-01,2022-10-01,2021-09-23,-1,9,2021-10-01
46205,1715,19,4,0,0,0,0,4,4,5,...,34,2020-01-01,"Construction, Plumbing",,2021-10-01,2022-10-01,2021-09-23,-2,9,2020-10-01
55117,1715,19,2,0,1,0,0,7,3,1,...,3,2020-12-01,"Construction, Plumbing",,2021-10-01,2022-10-01,2021-09-23,-1,10,2021-10-01
60076,1715,19,3,2,0,0,0,2,8,2,...,5,2021-06-01,"Construction, Plumbing",,2021-10-01,2022-10-01,2021-09-23,-1,4,2021-10-01
60918,1715,19,3,1,0,0,0,0,6,1,...,4,2021-07-01,"Construction, Plumbing",,2021-10-01,2022-10-01,2021-09-23,-1,3,2021-10-01
47020,1715,19,4,0,0,0,0,4,3,3,...,20,2020-02-01,"Construction, Plumbing",,2021-10-01,2022-10-01,2021-09-23,-2,8,2020-10-01


In [108]:
df_database.loc[df_database["user_ID"] == 1715, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
1228,2021-10-01
2618,2019-10-01


In [109]:
df_workspace.loc[df_workspace["user_ID"] == 1715, "palms_date"]

43861   2019-10-01
44640   2019-11-01
45417   2019-12-01
46205   2020-01-01
47020   2020-02-01
47895   2020-03-01
48736   2020-04-01
49570   2020-05-01
50373   2020-06-01
51180   2020-07-01
51980   2020-08-01
52774   2020-09-01
53567   2020-10-01
54361   2020-11-01
55117   2020-12-01
55963   2021-01-01
56793   2021-02-01
57620   2021-03-01
58441   2021-04-01
59259   2021-05-01
60076   2021-06-01
60918   2021-07-01
61764   2021-08-01
62577   2021-09-01
63576   2021-10-01
Name: palms_date, dtype: datetime64[ns]

In [110]:
df_workspace.loc[df_workspace["user_ID"] == 1715, "join_date"] = pd.Timestamp("2019-10-01")

In [111]:
df_dropped.loc[df_dropped["user_ID"] == 1715, ["drop_date"]].drop_duplicates()

Unnamed: 0,drop_date
30,2021-09-23


#### - User 1765 

In [112]:
df_workspace.loc[df_workspace["user_ID"] == 1765].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
57801,1765,26,4,0,0,0,0,0,3,2,...,5,2021-03-01,"Legal & Accounting, Accounting Services",7.0,2019-11-01,2022-03-01,2019-09-19,1,12,2022-03-01
56161,1765,26,4,0,0,0,0,1,4,0,...,4,2021-01-01,"Legal & Accounting, Accounting Services",7.0,2019-11-01,2022-03-01,2019-09-19,1,2,2021-03-01
47270,1765,26,4,0,0,0,0,1,5,1,...,1,2020-02-01,"Legal & Accounting, Accounting Services",7.0,2019-11-01,2022-03-01,2019-09-19,0,1,2020-03-01
50585,1765,26,4,0,0,0,0,1,3,0,...,0,2020-06-01,"Legal & Accounting, Accounting Services",7.0,2019-11-01,2022-03-01,2019-09-19,0,9,2021-03-01
37643,1765,14,2,0,1,0,1,1,4,0,...,6,2019-03-01,"Legal & Accounting, Accounting Services",7.0,2019-11-01,2022-03-01,2019-09-19,-1,12,2020-03-01
51382,1765,26,5,0,0,0,0,1,0,0,...,4,2020-07-01,"Legal & Accounting, Accounting Services",7.0,2019-11-01,2022-03-01,2019-09-19,0,8,2021-03-01
45665,1765,26,3,0,0,0,0,4,0,0,...,4,2019-12-01,"Legal & Accounting, Accounting Services",7.0,2019-11-01,2022-03-01,2019-09-19,0,3,2020-03-01
32448,1765,14,4,0,0,0,0,1,3,0,...,19,2018-09-01,"Legal & Accounting, Accounting Services",7.0,2019-11-01,2022-03-01,2019-09-19,-2,6,2019-03-01


In [149]:
df_workspace.loc[(df_workspace["user_ID"] == 1765) & (df_workspace["chapter_ID"] == 14)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
25881,1765,14,2,0,0,0,0,8,3,1,...,0,2018-01-01,"Legal & Accounting, Accounting Services",7.0,2018-02-01,2022-03-01,2019-09-19,-2,2,2018-03-01
26723,1765,14,3,0,1,0,0,1,1,1,...,6,2018-02-01,"Legal & Accounting, Accounting Services",7.0,2018-02-01,2022-03-01,2019-09-19,-2,1,2018-03-01
27536,1765,14,4,0,0,0,0,1,4,1,...,11,2018-03-01,"Legal & Accounting, Accounting Services",7.0,2018-02-01,2022-03-01,2019-09-19,-2,12,2019-03-01


In [114]:
df_workspace.loc[(df_workspace["user_ID"] == 1765) & (df_workspace["chapter_ID"] == 26)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
44894,1765,26,3,0,0,0,1,3,1,2,...,7,2019-11-01,"Legal & Accounting, Accounting Services",7.0,2019-11-01,2022-03-01,2019-09-19,0,4,2020-03-01
45665,1765,26,3,0,0,0,0,4,0,0,...,4,2019-12-01,"Legal & Accounting, Accounting Services",7.0,2019-11-01,2022-03-01,2019-09-19,0,3,2020-03-01
46453,1765,26,4,0,0,0,0,3,1,0,...,3,2020-01-01,"Legal & Accounting, Accounting Services",7.0,2019-11-01,2022-03-01,2019-09-19,0,2,2020-03-01


In [150]:
df_adjust_records.loc[1765] = [-1, "2019-11-01", 11, "2017-08-01"]
df_adjust_records

Unnamed: 0,membership_months_to_sub,membership_months_sub_date_start,renewal_months_to_add,renewal_months_add_palms_date_end
2092,-24,2019-08-01,8,2017-08-01
2205,16,2021-03-01,8,2018-04-01
1765,-1,2019-11-01,11,2017-08-01


In [115]:
df_database.loc[df_database["user_ID"] == 1765, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
1204,2019-11-01
2959,2017-03-01
3162,2018-02-01


In [117]:
df_workspace.loc[df_workspace["user_ID"] == 1765, "palms_date"]

25881   2018-01-01
26723   2018-02-01
27536   2018-03-01
28346   2018-04-01
29158   2018-05-01
29992   2018-06-01
30817   2018-07-01
31621   2018-08-01
32448   2018-09-01
33300   2018-10-01
34130   2018-11-01
34995   2018-12-01
35851   2019-01-01
36740   2019-02-01
37643   2019-03-01
38526   2019-04-01
39417   2019-05-01
40291   2019-06-01
41180   2019-07-01
42048   2019-08-01
42869   2019-09-01
44894   2019-11-01
45665   2019-12-01
46453   2020-01-01
47270   2020-02-01
48107   2020-03-01
48951   2020-04-01
49785   2020-05-01
50585   2020-06-01
51382   2020-07-01
52184   2020-08-01
52978   2020-09-01
53770   2020-10-01
54538   2020-11-01
55336   2020-12-01
56161   2021-01-01
56975   2021-02-01
57801   2021-03-01
58619   2021-04-01
59440   2021-05-01
60255   2021-06-01
61098   2021-07-01
61940   2021-08-01
62751   2021-09-01
63567   2021-10-01
Name: palms_date, dtype: datetime64[ns]

In [118]:
df_workspace.loc[df_workspace["user_ID"] == 1765, "join_date"] = pd.Timestamp("2018-02-01")

In [119]:
df_dropped.loc[df_dropped["user_ID"] == 1765, ["drop_date"]].drop_duplicates()

Unnamed: 0,drop_date
471,2019-09-19


In [118]:
df_workspace.loc[df_workspace["user_ID"] == 1765, "drop_date"] = pd.NaT

#### - User 2205 

In [120]:
df_workspace.loc[df_workspace["user_ID"] == 2205].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
43739,2205,15,4,1,0,0,0,4,7,0,...,1,2019-10-01,"Finance & Insurance, Residential Mortgages",2856.0,2021-03-01,2022-08-01,2019-11-14,-2,10,2020-08-01
31668,2205,15,4,0,0,0,0,0,2,0,...,10,2018-08-01,"Finance & Insurance, Residential Mortgages",2856.0,2021-03-01,2022-08-01,2019-11-14,-3,12,2019-08-01
59631,2205,6,5,0,0,0,0,1,7,1,...,4,2021-06-01,"Finance & Insurance, Residential Mortgages",2856.0,2021-03-01,2022-08-01,2019-11-14,0,2,2021-08-01
60459,2205,6,3,0,0,0,1,0,3,1,...,2,2021-07-01,"Finance & Insurance, Residential Mortgages",2856.0,2021-03-01,2022-08-01,2019-11-14,0,1,2021-08-01
30862,2205,15,4,0,0,0,1,0,4,0,...,4,2018-07-01,"Finance & Insurance, Residential Mortgages",2856.0,2021-03-01,2022-08-01,2019-11-14,-3,1,2018-08-01
62933,2205,6,4,0,0,0,0,2,1,1,...,3,2021-10-01,"Finance & Insurance, Residential Mortgages",2856.0,2021-03-01,2022-08-01,2019-11-14,0,10,2022-08-01
40352,2205,15,2,1,0,0,1,0,8,0,...,3,2019-06-01,"Finance & Insurance, Residential Mortgages",2856.0,2021-03-01,2022-08-01,2019-11-14,-2,2,2019-08-01
41245,2205,15,3,0,0,0,2,1,4,0,...,4,2019-07-01,"Finance & Insurance, Residential Mortgages",2856.0,2021-03-01,2022-08-01,2019-11-14,-2,1,2019-08-01


In [128]:
df_workspace.loc[(df_workspace["user_ID"] == 2205) & (df_workspace["chapter_ID"] == 15)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
42911,2205,15,3,1,0,0,0,2,1,0,...,3,2019-09-01,"Finance & Insurance, Residential Mortgages",2856.0,2021-03-01,2022-08-01,2019-11-14,-2,11,2020-08-01
43739,2205,15,4,1,0,0,0,4,7,0,...,1,2019-10-01,"Finance & Insurance, Residential Mortgages",2856.0,2021-03-01,2022-08-01,2019-11-14,-2,10,2020-08-01
44512,2205,15,3,0,0,0,0,0,1,0,...,0,2019-11-01,"Finance & Insurance, Residential Mortgages",2856.0,2021-03-01,2022-08-01,2019-11-14,-2,9,2020-08-01


In [122]:
df_workspace.loc[(df_workspace["user_ID"] == 2205) & (df_workspace["chapter_ID"] == 6)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
57172,2205,6,3,0,0,0,0,4,8,1,...,3,2021-03-01,"Finance & Insurance, Residential Mortgages",2856.0,2021-03-01,2022-08-01,2019-11-14,0,5,2021-08-01
57997,2205,6,3,1,0,0,0,3,4,0,...,2,2021-04-01,"Finance & Insurance, Residential Mortgages",2856.0,2021-03-01,2022-08-01,2019-11-14,0,4,2021-08-01
58816,2205,6,3,0,0,0,1,4,6,0,...,3,2021-05-01,"Finance & Insurance, Residential Mortgages",2856.0,2021-03-01,2022-08-01,2019-11-14,0,3,2021-08-01


In [151]:
df_adjust_records.loc[2205] = [-16, "2021-03-01", 8, "2018-04-01"]

In [152]:
df_database.loc[df_database["user_ID"] == 2205, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
1941,2021-03-01
2788,2018-04-01


In [153]:
df_workspace.loc[df_workspace["user_ID"] == 2205, "join_date"] = pd.Timestamp("2018-04-01")

In [155]:
df_dropped.loc[df_dropped["user_ID"] == 2205, ["drop_date"]].drop_duplicates()

Unnamed: 0,drop_date
413,2019-11-14


In [154]:
df_workspace.loc[df_workspace["user_ID"] == 2205, "palms_date"].max()

Timestamp('2021-10-01 00:00:00')

In [156]:
df_workspace.loc[df_workspace["user_ID"] == 2205, "drop_date"] =  pd.NaT

#### - User 292 

In [157]:
df_workspace.loc[df_workspace["user_ID"] == 292].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
36747,292,14,4,0,0,0,0,0,5,2,...,5,2019-02-01,"Finance & Insurance, Property & Casualty Insur...",598.0,2019-09-01,2022-03-01,2019-08-14,-1,1,2019-03-01
40299,292,14,3,0,0,0,1,1,1,1,...,0,2019-06-01,"Finance & Insurance, Property & Casualty Insur...",598.0,2019-09-01,2022-03-01,2019-08-14,-1,9,2020-03-01
56483,292,10,3,0,1,0,0,0,2,2,...,4,2021-02-01,"Finance & Insurance, Property & Casualty Insur...",598.0,2019-09-01,2022-03-01,2019-08-14,1,1,2021-03-01
50099,292,10,3,0,1,0,0,2,2,3,...,4,2020-06-01,"Finance & Insurance, Property & Casualty Insur...",598.0,2019-09-01,2022-03-01,2019-08-14,0,9,2021-03-01
49289,292,10,4,0,0,0,0,2,1,3,...,0,2020-05-01,"Finance & Insurance, Property & Casualty Insur...",598.0,2019-09-01,2022-03-01,2019-08-14,0,10,2021-03-01
50900,292,10,2,1,1,0,1,2,1,2,...,1,2020-07-01,"Finance & Insurance, Property & Casualty Insur...",598.0,2019-09-01,2022-03-01,2019-08-14,0,8,2021-03-01
45163,292,10,2,1,0,0,0,1,6,1,...,1,2019-12-01,"Finance & Insurance, Property & Casualty Insur...",598.0,2019-09-01,2022-03-01,2019-08-14,0,3,2020-03-01
32455,292,14,4,0,0,0,0,1,3,1,...,0,2018-09-01,"Finance & Insurance, Property & Casualty Insur...",598.0,2019-09-01,2022-03-01,2019-08-14,-1,6,2019-03-01


In [158]:
df_workspace.loc[(df_workspace["user_ID"] == 292) & (df_workspace["chapter_ID"] == 14)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
40299,292,14,3,0,0,0,1,1,1,1,...,0,2019-06-01,"Finance & Insurance, Property & Casualty Insur...",598.0,2019-09-01,2022-03-01,2019-08-14,-1,9,2020-03-01
41188,292,14,4,0,0,0,1,0,0,1,...,1,2019-07-01,"Finance & Insurance, Property & Casualty Insur...",598.0,2019-09-01,2022-03-01,2019-08-14,-1,8,2020-03-01
42055,292,14,3,0,0,0,0,2,0,0,...,0,2019-08-01,"Finance & Insurance, Property & Casualty Insur...",598.0,2019-09-01,2022-03-01,2019-08-14,-1,7,2020-03-01


In [159]:
df_workspace.loc[(df_workspace["user_ID"] == 292) & (df_workspace["chapter_ID"] == 10)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
42756,292,10,3,0,0,0,1,0,1,0,...,6,2019-09-01,"Finance & Insurance, Property & Casualty Insur...",598.0,2019-09-01,2022-03-01,2019-08-14,0,6,2020-03-01
43586,292,10,4,0,0,0,1,0,3,5,...,2,2019-10-01,"Finance & Insurance, Property & Casualty Insur...",598.0,2019-09-01,2022-03-01,2019-08-14,0,5,2020-03-01
44382,292,10,4,0,0,0,0,0,4,2,...,5,2019-11-01,"Finance & Insurance, Property & Casualty Insur...",598.0,2019-09-01,2022-03-01,2019-08-14,0,4,2020-03-01


In [160]:
df_database.loc[df_database["user_ID"] == 292, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
1620,2019-09-01
3165,2018-02-01


In [163]:
df_workspace.loc[df_workspace["user_ID"] == 292, "join_date"] = pd.Timestamp("2018-02-01")

In [164]:
df_workspace.loc[df_workspace["user_ID"] == 292, "palms_date"].max()

Timestamp('2021-10-01 00:00:00')

In [165]:
df_workspace.loc[df_workspace["user_ID"] == 292, "drop_date"] = pd.NaT

#### - User 2827 

In [166]:
df_workspace.loc[df_workspace["user_ID"] == 2827].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
36229,2827,25,3,0,0,0,0,1,5,4,...,4,2019-01-01,"Marketing, Graphic Designer",,2019-02-01,2020-02-01,2020-02-01,-1,1,2019-02-01
38895,2827,25,4,0,0,0,0,3,3,2,...,14,2019-04-01,"Marketing, Graphic Designer",,2019-02-01,2020-02-01,2020-02-01,0,10,2020-02-01
30220,2827,21,4,0,0,0,0,0,4,0,...,4,2018-06-01,"Marketing, Graphic Designer",,2019-02-01,2020-02-01,2020-02-01,-1,8,2019-02-01
35234,2827,21,3,0,0,0,0,0,3,0,...,3,2018-12-01,"Marketing, Graphic Designer",,2019-02-01,2020-02-01,2020-02-01,-1,2,2019-02-01
40662,2827,25,2,0,2,0,0,2,2,0,...,5,2019-06-01,"Marketing, Graphic Designer",,2019-02-01,2020-02-01,2020-02-01,0,8,2020-02-01
25270,2827,21,1,0,1,0,0,1,1,0,...,1,2017-12-01,"Marketing, Graphic Designer",,2019-02-01,2020-02-01,2020-02-01,-2,2,2018-02-01
43233,2827,25,3,0,1,0,0,0,5,0,...,7,2019-09-01,"Marketing, Graphic Designer",,2019-02-01,2020-02-01,2020-02-01,0,5,2020-02-01
22825,2827,21,3,0,1,0,0,3,4,1,...,12,2017-09-01,"Marketing, Graphic Designer",,2019-02-01,2020-02-01,2020-02-01,-2,5,2018-02-01


In [167]:
df_workspace.loc[(df_workspace["user_ID"] == 2827) & (df_workspace["chapter_ID"] == 21)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
34371,2827,21,3,0,0,0,1,2,4,3,...,8,2018-11-01,"Marketing, Graphic Designer",,2019-02-01,2020-02-01,2020-02-01,-1,3,2019-02-01
35234,2827,21,3,0,0,0,0,0,3,0,...,3,2018-12-01,"Marketing, Graphic Designer",,2019-02-01,2020-02-01,2020-02-01,-1,2,2019-02-01
36097,2827,21,2,0,0,0,0,2,2,4,...,4,2019-01-01,"Marketing, Graphic Designer",,2019-02-01,2020-02-01,2020-02-01,-1,1,2019-02-01


In [168]:
df_workspace.loc[(df_workspace["user_ID"] == 2827) & (df_workspace["chapter_ID"] == 25)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
36229,2827,25,3,0,0,0,0,1,5,4,...,4,2019-01-01,"Marketing, Graphic Designer",,2019-02-01,2020-02-01,2020-02-01,-1,1,2019-02-01
37105,2827,25,3,1,0,0,0,2,3,2,...,5,2019-02-01,"Marketing, Graphic Designer",,2019-02-01,2020-02-01,2020-02-01,0,12,2020-02-01
38005,2827,25,2,0,0,0,2,1,1,3,...,4,2019-03-01,"Marketing, Graphic Designer",,2019-02-01,2020-02-01,2020-02-01,0,11,2020-02-01


In [169]:
df_workspace.loc[[36229], df_workspace.columns[2:15]] = df_workspace.loc[36229, df_workspace.columns[2:15]].values +\
    df_workspace.loc[36097, df_workspace.columns[2:15]].values
df_workspace.drop(36097, axis=0, inplace=True)

In [170]:
df_database.loc[df_database["user_ID"] == 2827, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
2717,2019-02-01
3069,2017-08-01


In [171]:
df_workspace.loc[df_workspace["user_ID"] == 2827, "join_date"] = pd.Timestamp("2017-08-01")

In [173]:
df_dropped.loc[df_dropped["user_ID"] == 2827, ["drop_date"]].drop_duplicates()

Unnamed: 0,drop_date
345,2020-02-01
639,2019-01-16


In [172]:
df_workspace.loc[df_workspace["user_ID"] == 2827, "palms_date"].max()

Timestamp('2020-02-01 00:00:00')

In [175]:
df_workspace.loc[df_workspace["user_ID"] == 2827, "drop_date"] = pd.Timestamp("2020-02-01")

#### - User 938 

In [176]:
df_workspace.loc[df_workspace["user_ID"] == 938].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
40377,938,17,3,0,0,0,0,0,1,0,...,2,2019-06-01,"Accounting, Accountant",,2019-05-01,2021-05-01,2020-08-07,0,11,2020-05-01
48624,938,17,5,0,0,0,0,1,2,0,...,6,2020-04-01,"Accounting, Accountant",,2019-05-01,2021-05-01,2020-08-07,0,1,2020-05-01
32983,938,4,4,0,0,0,0,0,2,0,...,8,2018-10-01,"Accounting, Accountant",,2019-05-01,2021-05-01,2020-08-07,-1,7,2019-05-01
42940,938,17,4,0,0,0,0,1,3,0,...,11,2019-09-01,"Accounting, Accountant",,2019-05-01,2021-05-01,2020-08-07,0,8,2020-05-01
45317,938,17,2,0,1,0,1,1,1,1,...,3,2019-12-01,"Accounting, Accountant",,2019-05-01,2021-05-01,2020-08-07,0,5,2020-05-01
24802,938,4,3,0,0,0,0,0,1,0,...,19,2017-12-01,"Accounting, Accountant",,2019-05-01,2021-05-01,2020-08-07,-2,5,2018-05-01
47782,938,17,3,0,0,0,0,0,1,0,...,2,2020-03-01,"Accounting, Accountant",,2019-05-01,2021-05-01,2020-08-07,0,2,2020-05-01
22343,938,4,4,0,0,0,0,1,1,0,...,12,2017-09-01,"Accounting, Accountant",,2019-05-01,2021-05-01,2020-08-07,-2,8,2018-05-01


In [177]:
df_workspace.loc[(df_workspace["user_ID"] == 938) & (df_workspace["chapter_ID"] == 4)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
33833,938,4,4,0,0,0,0,1,0,0,...,3,2018-11-01,"Accounting, Accountant",,2019-05-01,2021-05-01,2020-08-07,-1,6,2019-05-01
34660,938,4,1,0,2,0,0,3,0,0,...,7,2018-12-01,"Accounting, Accountant",,2019-05-01,2021-05-01,2020-08-07,-1,5,2019-05-01
35519,938,4,2,1,0,0,0,0,1,1,...,0,2019-01-01,"Accounting, Accountant",,2019-05-01,2021-05-01,2020-08-07,-1,4,2019-05-01


In [178]:
df_workspace.loc[(df_workspace["user_ID"] == 938) & (df_workspace["chapter_ID"] == 17)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
40377,938,17,3,0,0,0,0,0,1,0,...,2,2019-06-01,"Accounting, Accountant",,2019-05-01,2021-05-01,2020-08-07,0,11,2020-05-01
41272,938,17,4,1,0,0,0,1,4,0,...,5,2019-07-01,"Accounting, Accountant",,2019-05-01,2021-05-01,2020-08-07,0,10,2020-05-01
42118,938,17,3,1,0,0,0,0,2,0,...,7,2019-08-01,"Accounting, Accountant",,2019-05-01,2021-05-01,2020-08-07,0,9,2020-05-01


In [179]:
df_database.loc[df_database["user_ID"] == 938, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
2126,2019-05-01
2899,2017-08-01


In [180]:
df_adjust_records.loc[938] = [-5, "2019-06-01", 3, "2017-08-01"]

In [181]:
df_workspace.loc[df_workspace["user_ID"] == 938, "join_date"] = pd.Timestamp("2017-08-01")

In [182]:
df_dropped.loc[df_dropped["user_ID"] == 938, ["drop_date"]].drop_duplicates()

Unnamed: 0,drop_date
243,2020-08-07


In [183]:
df_workspace.loc[df_workspace["user_ID"] == 938, "palms_date"].max()

Timestamp('2020-08-01 00:00:00')

#### - User 2347 

In [184]:
df_workspace.loc[df_workspace["user_ID"] == 2347].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
9639,2347,17,3,1,0,0,0,1,0,1,...,11,2016-04-01,"Advertising & Marketing, Social Media",576.0,2019-06-01,2022-06-01,2016-08-09,-4,2,2016-06-01
12561,2347,17,3,0,0,0,0,0,0,0,...,0,2016-08-01,"Advertising & Marketing, Social Media",576.0,2019-06-01,2022-06-01,2016-08-09,-3,10,2017-06-01
56476,2347,10,4,0,0,0,0,2,3,1,...,0,2021-02-01,"Advertising & Marketing, Social Media",576.0,2019-06-01,2022-06-01,2016-08-09,1,4,2021-06-01
50092,2347,10,3,0,1,0,0,0,3,1,...,20,2020-06-01,"Advertising & Marketing, Social Media",576.0,2019-06-01,2022-06-01,2016-08-09,1,12,2021-06-01
49282,2347,10,4,0,0,0,0,0,0,0,...,0,2020-05-01,"Advertising & Marketing, Social Media",576.0,2019-06-01,2022-06-01,2016-08-09,0,1,2020-06-01
50893,2347,10,5,0,0,0,0,0,1,1,...,16,2020-07-01,"Advertising & Marketing, Social Media",576.0,2019-06-01,2022-06-01,2016-08-09,1,11,2021-06-01
45156,2347,10,0,0,2,1,0,1,1,0,...,0,2019-12-01,"Advertising & Marketing, Social Media",576.0,2019-06-01,2022-06-01,2016-08-09,0,6,2020-06-01
6209,2347,17,2,1,0,0,0,0,2,4,...,0,2015-11-01,"Advertising & Marketing, Social Media",576.0,2019-06-01,2022-06-01,2016-08-09,-4,7,2016-06-01


In [193]:
df_workspace.loc[(df_workspace["user_ID"] == 2347) & (df_workspace["chapter_ID"] == 17)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
11087,2347,17,5,0,0,0,0,1,1,4,...,0,2016-06-01,"Advertising & Marketing, Social Media",576.0,2019-06-01,2022-06-01,2016-08-09,-3,12,2017-06-01
11820,2347,17,4,0,0,0,0,0,0,0,...,0,2016-07-01,"Advertising & Marketing, Social Media",576.0,2019-06-01,2022-06-01,2016-08-09,-3,11,2017-06-01
12561,2347,17,3,0,0,0,0,0,0,0,...,0,2016-08-01,"Advertising & Marketing, Social Media",576.0,2019-06-01,2022-06-01,2016-08-09,-3,10,2017-06-01


In [186]:
df_workspace.loc[(df_workspace["user_ID"] == 2347) & (df_workspace["chapter_ID"] == 10)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
41049,2347,10,3,0,0,0,0,2,0,4,...,0,2019-07-01,"Advertising & Marketing, Social Media",576.0,2019-06-01,2022-06-01,2016-08-09,0,11,2020-06-01
41927,2347,10,4,0,0,1,0,1,0,1,...,5,2019-08-01,"Advertising & Marketing, Social Media",576.0,2019-06-01,2022-06-01,2016-08-09,0,10,2020-06-01
42749,2347,10,2,1,0,0,1,0,0,3,...,0,2019-09-01,"Advertising & Marketing, Social Media",576.0,2019-06-01,2022-06-01,2016-08-09,0,9,2020-06-01


In [187]:
df_database.loc[df_database["user_ID"] == 2347, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
1615,2019-06-01
2067,2015-06-01
2358,2014-08-01


In [191]:
df_adjust_records.loc[2347] = [-35, "2019-07-01", 0, ""]

In [194]:
df_workspace.loc[df_workspace["user_ID"] == 2347, "join_date"] = pd.Timestamp("2015-06-01")

In [196]:
df_dropped.loc[df_dropped["user_ID"] == 2347, ["drop_date"]].drop_duplicates()

Unnamed: 0,drop_date
1162,2016-08-09


In [195]:
df_workspace.loc[df_workspace["user_ID"] == 2347, "palms_date"].max()

Timestamp('2021-10-01 00:00:00')

In [198]:
df_workspace.loc[df_workspace["user_ID"] == 2347, "drop_date"] = pd.NaT

#### - User 2497 

In [199]:
df_workspace.loc[df_workspace["user_ID"] == 2497].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
46831,2497,14,1,0,0,0,0,0,0,0,...,0,2020-02-01,"Food & Beverage, Caterer",2070.0,2021-08-01,2022-08-01,2021-05-01,-2,6,2020-08-01
54955,2497,14,5,0,0,0,0,0,1,4,...,0,2020-12-01,"Food & Beverage, Caterer",2070.0,2021-08-01,2022-08-01,2021-05-01,-1,8,2021-08-01
48544,2497,14,4,0,0,0,0,1,0,2,...,9,2020-04-01,"Food & Beverage, Caterer",2070.0,2021-08-01,2022-08-01,2021-05-01,-2,4,2020-08-01
47706,2497,14,4,0,0,0,0,3,1,4,...,38,2020-03-01,"Food & Beverage, Caterer",2070.0,2021-08-01,2022-08-01,2021-05-01,-2,5,2020-08-01
50193,2497,14,4,0,0,0,0,1,2,2,...,3,2020-06-01,"Food & Beverage, Caterer",2070.0,2021-08-01,2022-08-01,2021-05-01,-2,2,2020-08-01
50997,2497,14,3,0,0,0,0,0,1,2,...,0,2020-07-01,"Food & Beverage, Caterer",2070.0,2021-08-01,2022-08-01,2021-05-01,-2,1,2020-08-01
58257,2497,14,3,0,1,0,0,0,7,9,...,9,2021-04-01,"Food & Beverage, Caterer",2070.0,2021-08-01,2022-08-01,2021-05-01,-1,4,2021-08-01
59073,2497,14,1,0,1,0,0,0,1,0,...,1,2021-05-01,"Food & Beverage, Caterer",2070.0,2021-08-01,2022-08-01,2021-05-01,-1,3,2021-08-01


In [202]:
df_workspace.loc[df_workspace["user_ID"] == 2497, "palms_date"]

46831   2020-02-01
47706   2020-03-01
48544   2020-04-01
49378   2020-05-01
50193   2020-06-01
50997   2020-07-01
51805   2020-08-01
52603   2020-09-01
53401   2020-10-01
54188   2020-11-01
54955   2020-12-01
55781   2021-01-01
56608   2021-02-01
57430   2021-03-01
58257   2021-04-01
59073   2021-05-01
60528   2021-07-01
61362   2021-08-01
62196   2021-09-01
63002   2021-10-01
Name: palms_date, dtype: datetime64[ns]

In [204]:
df_database.loc[df_database["user_ID"] == 2497, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
479,2021-08-01
3214,2020-02-01


In [205]:
df_workspace.loc[df_workspace["user_ID"] == 2497, "join_date"] = pd.Timestamp("2020-02-01")

In [206]:
df_workspace.loc[df_workspace["user_ID"] == 2497, "palms_date"].max()

Timestamp('2021-10-01 00:00:00')

In [207]:
df_workspace.loc[df_workspace["user_ID"] == 2497, "drop_date"] = pd.NaT

#### - User 2208

In [208]:
df_workspace.loc[df_workspace["user_ID"] == 2208].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
13950,2208,11,1,1,0,0,0,0,0,0,...,0,2016-10-01,"Construction, Flooring",576.0,2021-07-01,2022-07-01,NaT,-5,9,2017-07-01
24985,2208,11,3,0,0,0,0,1,1,0,...,4,2017-12-01,"Construction, Flooring",576.0,2021-07-01,2022-07-01,NaT,-4,7,2018-07-01
15476,2208,11,2,0,0,0,0,0,0,0,...,0,2016-12-01,"Construction, Flooring",576.0,2021-07-01,2022-07-01,NaT,-5,7,2017-07-01
14732,2208,11,5,0,0,0,0,3,1,1,...,6,2016-11-01,"Construction, Flooring",576.0,2021-07-01,2022-07-01,NaT,-5,8,2017-07-01
16958,2208,11,4,0,0,0,0,0,1,0,...,0,2017-02-01,"Construction, Flooring",576.0,2021-07-01,2022-07-01,NaT,-5,5,2017-07-01
17693,2208,11,4,0,0,0,0,2,8,2,...,14,2017-03-01,"Construction, Flooring",576.0,2021-07-01,2022-07-01,NaT,-5,4,2017-07-01
59745,2208,10,2,0,0,0,0,2,0,0,...,1,2021-06-01,"Construction, Flooring",576.0,2021-07-01,2022-07-01,NaT,-1,1,2021-07-01
63055,2208,10,3,0,1,0,0,1,3,1,...,7,2021-10-01,"Construction, Flooring",576.0,2021-07-01,2022-07-01,NaT,0,9,2022-07-01


In [211]:
df_workspace.loc[df_workspace["user_ID"] == 2208, "palms_date"]

13950   2016-10-01
14732   2016-11-01
15476   2016-12-01
16219   2017-01-01
16958   2017-02-01
17693   2017-03-01
18445   2017-04-01
19185   2017-05-01
20019   2017-06-01
20857   2017-07-01
21682   2017-08-01
22534   2017-09-01
23360   2017-10-01
24181   2017-11-01
24985   2017-12-01
25780   2018-01-01
59745   2021-06-01
60581   2021-07-01
61417   2021-08-01
62251   2021-09-01
63055   2021-10-01
Name: palms_date, dtype: datetime64[ns]

In [209]:
df_database.loc[df_database["user_ID"] == 2208, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
1642,2021-07-01
2280,2016-10-01


In [212]:
df_adjust_records.loc[2208] = [-30, "2021-06-01", 5, "2016-10-01"]

In [213]:
df_workspace.loc[df_workspace["user_ID"] == 2208, "join_date"] = pd.Timestamp("2016-10-01")

In [214]:
df_workspace.loc[df_workspace["user_ID"] == 2208, "palms_date"].max()

Timestamp('2021-10-01 00:00:00')

In [215]:
df_workspace.loc[df_workspace["user_ID"] == 2208, "drop_date"] = pd.NaT

#### - User 570

In [216]:
df_workspace.loc[df_workspace["user_ID"] == 570].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
21296,570,26,2,1,0,0,1,1,0,0,...,0,2017-07-01,"Legal & Accounting, Real Estate Law",,2017-07-01,2022-08-01,NaT,0,1,2017-08-01
63553,570,26,4,0,0,0,0,3,5,0,...,4,2021-10-01,"Legal & Accounting, Real Estate Law",,2017-07-01,2022-08-01,NaT,4,10,2022-08-01
32003,570,26,3,1,0,0,1,1,1,1,...,1,2018-08-01,"Legal & Accounting, Real Estate Law",,2017-07-01,2022-08-01,NaT,1,12,2019-08-01
14502,570,4,5,0,0,0,0,0,3,0,...,2,2016-11-01,"Legal & Accounting, Real Estate Law",,2017-07-01,2022-08-01,NaT,-1,9,2017-08-01
35381,570,26,3,0,0,0,0,3,7,2,...,2,2018-12-01,"Legal & Accounting, Real Estate Law",,2017-07-01,2022-08-01,NaT,1,8,2019-08-01
23840,570,26,3,0,0,0,1,1,3,0,...,4,2017-10-01,"Legal & Accounting, Real Estate Law",,2017-07-01,2022-08-01,NaT,0,10,2018-08-01
10786,570,4,4,0,0,0,0,1,1,0,...,1,2016-06-01,"Legal & Accounting, Real Estate Law",,2017-07-01,2022-08-01,NaT,-2,2,2016-08-01
44091,570,26,4,1,0,0,0,2,4,0,...,7,2019-10-01,"Legal & Accounting, Real Estate Law",,2017-07-01,2022-08-01,NaT,2,10,2020-08-01


In [217]:
df_workspace.loc[(df_workspace["user_ID"] == 570) & (df_workspace["chapter_ID"] == 4)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
18975,570,4,3,1,1,0,0,0,6,0,...,7,2017-05-01,"Legal & Accounting, Real Estate Law",,2017-07-01,2022-08-01,NaT,-1,3,2017-08-01
19814,570,4,1,3,0,0,0,1,4,1,...,0,2017-06-01,"Legal & Accounting, Real Estate Law",,2017-07-01,2022-08-01,NaT,-1,2,2017-08-01
20654,570,4,2,0,0,0,0,0,1,0,...,0,2017-07-01,"Legal & Accounting, Real Estate Law",,2017-07-01,2022-08-01,NaT,0,1,2017-08-01


In [218]:
df_workspace.loc[(df_workspace["user_ID"] == 570) & (df_workspace["chapter_ID"] == 26)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
21296,570,26,2,1,0,0,1,1,0,0,...,0,2017-07-01,"Legal & Accounting, Real Estate Law",,2017-07-01,2022-08-01,NaT,0,1,2017-08-01
22164,570,26,5,0,0,0,0,1,5,0,...,4,2017-08-01,"Legal & Accounting, Real Estate Law",,2017-07-01,2022-08-01,NaT,0,12,2018-08-01
23001,570,26,3,0,0,0,1,1,0,0,...,0,2017-09-01,"Legal & Accounting, Real Estate Law",,2017-07-01,2022-08-01,NaT,0,11,2018-08-01


In [219]:
df_workspace.loc[[21296], df_workspace.columns[2:15]] = df_workspace.loc[21296, df_workspace.columns[2:15]].values +\
    df_workspace.loc[20654, df_workspace.columns[2:15]].values
df_workspace.drop(20654, axis=0, inplace=True)

In [220]:
df_database.loc[df_database["user_ID"] == 570, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
1169,2017-07-01
2866,2016-03-01


In [221]:
df_adjust_records.loc[570] = [0, "", 0, "fix relative renewal date"]

In [222]:
df_workspace.loc[df_workspace["user_ID"] == 570, "join_date"] = pd.Timestamp("2016-03-01")

In [223]:
df_workspace.loc[df_workspace["user_ID"] == 570, "palms_date"].max()

Timestamp('2021-10-01 00:00:00')

In [224]:
df_workspace.loc[df_workspace["user_ID"] == 570, "drop_date"] = pd.NaT

#### - User 2387 

In [225]:
df_workspace.loc[df_workspace["user_ID"] == 2387].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
26573,2387,9,4,0,0,0,0,2,3,1,...,8,2018-02-01,"Advertising & Marketing, Copywriter/Writer",,2016-08-01,2021-06-01,2021-06-01,1,4,2018-06-01
36580,2387,9,4,0,0,0,0,4,6,0,...,13,2019-02-01,"Advertising & Marketing, Copywriter/Writer",,2016-08-01,2021-06-01,2021-06-01,2,4,2019-06-01
17637,2387,9,5,0,0,0,0,0,7,2,...,9,2017-03-01,"Advertising & Marketing, Copywriter/Writer",,2016-08-01,2021-06-01,2021-06-01,0,3,2017-06-01
4985,2387,17,5,0,0,0,0,3,2,0,...,4,2015-09-01,"Advertising & Marketing, Copywriter/Writer",,2016-08-01,2021-06-01,2021-06-01,-1,9,2016-06-01
29002,2387,9,5,0,0,0,0,3,1,3,...,13,2018-05-01,"Advertising & Marketing, Copywriter/Writer",,2016-08-01,2021-06-01,2021-06-01,1,1,2018-06-01
59727,2387,9,4,0,0,0,0,0,0,0,...,0,2021-06-01,"Advertising & Marketing, Copywriter/Writer",,2016-08-01,2021-06-01,2021-06-01,4,12,2022-06-01
46709,2387,9,4,0,0,0,0,1,3,1,...,8,2020-02-01,"Advertising & Marketing, Copywriter/Writer",,2016-08-01,2021-06-01,2021-06-01,3,4,2020-06-01
5596,2387,17,4,0,0,0,0,0,2,0,...,0,2015-10-01,"Advertising & Marketing, Copywriter/Writer",,2016-08-01,2021-06-01,2021-06-01,-1,8,2016-06-01


In [227]:
df_workspace.loc[(df_workspace["user_ID"] == 2387) & (df_workspace["chapter_ID"] == 17)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
7515,2387,17,4,0,0,0,0,2,1,0,...,14,2016-01-01,"Advertising & Marketing, Copywriter/Writer",,2016-08-01,2021-06-01,2021-06-01,-1,5,2016-06-01
8199,2387,17,4,0,0,0,0,1,2,0,...,8,2016-02-01,"Advertising & Marketing, Copywriter/Writer",,2016-08-01,2021-06-01,2021-06-01,-1,4,2016-06-01
8936,2387,17,3,0,0,0,0,1,1,1,...,14,2016-03-01,"Advertising & Marketing, Copywriter/Writer",,2016-08-01,2021-06-01,2021-06-01,-1,3,2016-06-01


In [228]:
df_workspace.loc[(df_workspace["user_ID"] == 2387) & (df_workspace["chapter_ID"] == 9)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
11638,2387,9,2,0,0,0,0,0,1,1,...,0,2016-07-01,"Advertising & Marketing, Copywriter/Writer",,2016-08-01,2021-06-01,2021-06-01,-1,11,2017-06-01
12379,2387,9,4,0,0,0,0,0,4,2,...,9,2016-08-01,"Advertising & Marketing, Copywriter/Writer",,2016-08-01,2021-06-01,2021-06-01,0,10,2017-06-01
13122,2387,9,5,0,0,0,0,1,2,1,...,0,2016-09-01,"Advertising & Marketing, Copywriter/Writer",,2016-08-01,2021-06-01,2021-06-01,0,9,2017-06-01


In [229]:
df_database.loc[df_database["user_ID"] == 2387, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
408,2016-08-01
2022,2013-04-01


In [230]:
df_adjust_records.loc[2387] = [-4, "2016-07-01", 4, "2013-04-01"]

In [231]:
df_workspace.loc[df_workspace["user_ID"] == 2387, "join_date"] = pd.Timestamp("2013-04-01")

In [233]:
df_dropped.loc[df_dropped["user_ID"] == 2387, ["drop_date"]].drop_duplicates()

Unnamed: 0,drop_date
78,2021-06-01
1240,2016-03-14


In [232]:
df_workspace.loc[df_workspace["user_ID"] == 2387, "palms_date"].max()

Timestamp('2021-06-01 00:00:00')

In [234]:
df_workspace.loc[df_workspace["user_ID"] == 2387, "drop_date"] = pd.Timestamp("2021-06-01")

#### - User 637

In [235]:
df_workspace.loc[df_workspace["user_ID"] == 637].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
45339,637,17,0,1,0,0,0,0,0,0,...,0,2019-12-01,"Construction, Counter Tops",,2020-02-01,2021-12-01,2021-11-03,-1,12,2020-12-01
58089,637,9,5,0,0,0,0,0,22,0,...,5,2021-04-01,"Construction, Counter Tops",,2020-02-01,2021-12-01,2021-11-03,1,8,2021-12-01
52464,637,9,4,0,0,0,0,0,12,4,...,3,2020-09-01,"Construction, Counter Tops",,2020-02-01,2021-12-01,2021-11-03,0,3,2020-12-01
61390,637,9,3,1,0,0,0,1,3,1,...,2,2021-08-01,"Construction, Counter Tops",,2020-02-01,2021-12-01,2021-11-03,1,4,2021-12-01
50861,637,9,5,0,0,0,0,0,24,1,...,4,2020-07-01,"Construction, Counter Tops",,2020-02-01,2021-12-01,2021-11-03,0,5,2020-12-01
49248,637,9,0,0,0,4,0,0,0,0,...,0,2020-05-01,"Construction, Counter Tops",,2020-02-01,2021-12-01,2021-11-03,0,7,2020-12-01
57262,637,9,3,1,0,0,0,0,9,1,...,8,2021-03-01,"Construction, Counter Tops",,2020-02-01,2021-12-01,2021-11-03,1,9,2021-12-01
40400,637,17,3,1,0,0,0,1,5,2,...,9,2019-06-01,"Construction, Counter Tops",,2020-02-01,2021-12-01,2021-11-03,-1,6,2019-12-01


In [236]:
df_workspace.loc[(df_workspace["user_ID"] == 637) & (df_workspace["chapter_ID"] == 17)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
43785,637,17,4,0,0,0,1,1,10,0,...,6,2019-10-01,"Construction, Counter Tops",,2020-02-01,2021-12-01,2021-11-03,-1,2,2019-12-01
44563,637,17,0,2,0,0,2,0,0,0,...,2,2019-11-01,"Construction, Counter Tops",,2020-02-01,2021-12-01,2021-11-03,-1,1,2019-12-01
45339,637,17,0,1,0,0,0,0,0,0,...,0,2019-12-01,"Construction, Counter Tops",,2020-02-01,2021-12-01,2021-11-03,-1,12,2020-12-01


In [237]:
df_workspace.loc[(df_workspace["user_ID"] == 637) & (df_workspace["chapter_ID"] == 9)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
46700,637,9,3,0,0,0,0,1,8,1,...,2,2020-02-01,"Construction, Counter Tops",,2020-02-01,2021-12-01,2021-11-03,0,10,2020-12-01
47574,637,9,3,0,0,0,0,0,5,1,...,2,2020-03-01,"Construction, Counter Tops",,2020-02-01,2021-12-01,2021-11-03,0,9,2020-12-01
48414,637,9,2,0,0,3,0,0,0,0,...,0,2020-04-01,"Construction, Counter Tops",,2020-02-01,2021-12-01,2021-11-03,0,8,2020-12-01


In [238]:
df_database.loc[df_database["user_ID"] == 637, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
457,2020-02-01
2116,2018-10-01


In [239]:
df_adjust_records.loc[637] = [-2, "2020-02-01", 8, "2018-10-01"]

In [240]:
df_workspace.loc[df_workspace["user_ID"] == 637, "join_date"] = pd.Timestamp("2018-10-01")

In [241]:
df_workspace.loc[df_workspace["user_ID"] == 637, "palms_date"].max()

Timestamp('2021-10-01 00:00:00')

In [242]:
df_workspace.loc[df_workspace["user_ID"] == 637, "drop_date"] = pd.NaT

#### - User 1486

In [243]:
df_workspace.loc[df_workspace["user_ID"] == 1486].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
25590,1486,4,5,0,0,0,0,0,3,0,...,4,2018-01-01,"Trades, Painter",2466.0,2019-02-01,2020-02-01,NaT,-2,1,2018-02-01
20651,1486,4,4,0,0,0,0,2,0,1,...,2,2017-07-01,"Trades, Painter",2466.0,2019-02-01,2020-02-01,NaT,-2,7,2018-02-01
21484,1486,4,5,0,0,0,0,0,2,0,...,6,2017-08-01,"Trades, Painter",2466.0,2019-02-01,2020-02-01,NaT,-2,6,2018-02-01
28054,1486,4,3,1,0,0,0,3,1,0,...,4,2018-04-01,"Trades, Painter",2466.0,2019-02-01,2020-02-01,NaT,-1,10,2019-02-01
29699,1486,4,3,0,0,0,0,0,2,0,...,2,2018-06-01,"Trades, Painter",2466.0,2019-02-01,2020-02-01,NaT,-1,8,2019-02-01
23164,1486,4,4,1,0,0,0,2,1,0,...,1,2017-10-01,"Trades, Painter",2466.0,2019-02-01,2020-02-01,NaT,-2,4,2018-02-01
23993,1486,4,4,0,0,0,0,0,2,0,...,7,2017-11-01,"Trades, Painter",2466.0,2019-02-01,2020-02-01,NaT,-2,3,2018-02-01
19811,1486,4,3,0,0,0,0,0,0,0,...,0,2017-06-01,"Trades, Painter",2466.0,2019-02-01,2020-02-01,NaT,-2,8,2018-02-01


In [244]:
df_database.loc[df_database["user_ID"] == 1486, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
1386,2019-02-01
2895,2017-06-01


In [246]:
df_workspace.loc[df_workspace["user_ID"] == 1486, "palms_date"]

19811   2017-06-01
20651   2017-07-01
21484   2017-08-01
22340   2017-09-01
23164   2017-10-01
23993   2017-11-01
24799   2017-12-01
25590   2018-01-01
26434   2018-02-01
27250   2018-03-01
28054   2018-04-01
28863   2018-05-01
29699   2018-06-01
Name: palms_date, dtype: datetime64[ns]

In [194]:
df_workspace.loc[df_workspace["user_ID"] == 1486, "join_date"] = pd.Timestamp("2017-06-01")

In [248]:
df_workspace.loc[df_workspace["user_ID"] == 1486, "drop_date"] = pd.Timestamp("2018-06-01")

#### - User 32 

In [249]:
df_workspace.loc[df_workspace["user_ID"] == 32].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
36218,32,25,4,0,0,0,0,0,4,1,...,3,2019-01-01,"Computer & Programming, IT & Networks",,2018-11-01,2021-12-01,NaT,0,11,2019-12-01
39775,32,25,3,0,0,0,2,2,0,1,...,7,2019-05-01,"Computer & Programming, IT & Networks",,2018-11-01,2021-12-01,NaT,0,7,2019-12-01
38884,32,25,4,0,0,0,0,0,0,0,...,5,2019-04-01,"Computer & Programming, IT & Networks",,2018-11-01,2021-12-01,NaT,0,8,2019-12-01
49715,32,25,4,0,0,0,0,1,3,0,...,3,2020-05-01,"Computer & Programming, IT & Networks",,2018-11-01,2021-12-01,NaT,1,7,2020-12-01
35355,32,25,3,0,0,0,0,0,1,1,...,0,2018-12-01,"Computer & Programming, IT & Networks",,2018-11-01,2021-12-01,NaT,0,12,2019-12-01
59381,32,25,4,0,0,0,0,1,4,0,...,5,2021-05-01,"Computer & Programming, IT & Networks",,2018-11-01,2021-12-01,NaT,2,7,2021-12-01
41543,32,25,4,0,0,0,0,0,4,0,...,2,2019-07-01,"Computer & Programming, IT & Networks",,2018-11-01,2021-12-01,NaT,0,5,2019-12-01
30534,32,4,4,0,0,0,1,0,3,0,...,0,2018-07-01,"Computer & Programming, IT & Networks",,2018-11-01,2021-12-01,NaT,-1,5,2018-12-01


In [250]:
df_workspace.loc[(df_workspace["user_ID"] == 32) & (df_workspace["chapter_ID"] == 4)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
32133,32,4,4,0,0,0,0,0,1,0,...,0,2018-09-01,"Computer & Programming, IT & Networks",,2018-11-01,2021-12-01,NaT,-1,3,2018-12-01
32985,32,4,4,0,0,0,0,0,1,1,...,5,2018-10-01,"Computer & Programming, IT & Networks",,2018-11-01,2021-12-01,NaT,-1,2,2018-12-01
33835,32,4,3,1,0,0,0,0,0,0,...,0,2018-11-01,"Computer & Programming, IT & Networks",,2018-11-01,2021-12-01,NaT,0,1,2018-12-01


In [251]:
df_workspace.loc[(df_workspace["user_ID"] == 32) & (df_workspace["chapter_ID"] == 25)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
35355,32,25,3,0,0,0,0,0,1,1,...,0,2018-12-01,"Computer & Programming, IT & Networks",,2018-11-01,2021-12-01,NaT,0,12,2019-12-01
36218,32,25,4,0,0,0,0,0,4,1,...,3,2019-01-01,"Computer & Programming, IT & Networks",,2018-11-01,2021-12-01,NaT,0,11,2019-12-01
37094,32,25,4,0,0,0,0,2,2,1,...,4,2019-02-01,"Computer & Programming, IT & Networks",,2018-11-01,2021-12-01,NaT,0,10,2019-12-01


In [252]:
df_database.loc[df_database["user_ID"] == 32, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
2715,2018-11-01
2904,2017-12-01


In [253]:
df_workspace.loc[df_workspace["user_ID"] == 32, "join_date"] = pd.Timestamp("2017-12-01")

In [254]:
df_workspace.loc[df_workspace["user_ID"] == 32, "palms_date"].max()

Timestamp('2021-10-01 00:00:00')

In [255]:
df_workspace.loc[df_workspace["user_ID"] == 32, "drop_date"] = pd.NaT

#### - User 2288

In [256]:
df_workspace.loc[df_workspace["user_ID"] == 2288].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
6900,2288,19,3,0,0,0,0,1,1,0,...,0,2015-12-01,"Finance & Insurance, Group Benefits",1730.0,2021-10-01,2022-10-01,2016-04-06,-6,10,2016-10-01
3139,2288,19,4,0,0,0,0,4,1,2,...,0,2015-06-01,"Finance & Insurance, Group Benefits",1730.0,2021-10-01,2022-10-01,2016-04-06,-7,4,2015-10-01
3784,2288,19,3,0,0,0,0,3,2,1,...,12,2015-07-01,"Finance & Insurance, Group Benefits",1730.0,2021-10-01,2022-10-01,2016-04-06,-7,3,2015-10-01
8998,2288,19,5,0,0,0,0,1,0,2,...,0,2016-03-01,"Finance & Insurance, Group Benefits",1730.0,2021-10-01,2022-10-01,2016-04-06,-6,7,2016-10-01
63137,2288,12,2,0,0,0,0,1,0,0,...,0,2021-10-01,"Finance & Insurance, Group Benefits",1730.0,2021-10-01,2022-10-01,2016-04-06,0,12,2022-10-01
5031,2288,19,5,0,0,0,0,2,0,2,...,15,2015-09-01,"Finance & Insurance, Group Benefits",1730.0,2021-10-01,2022-10-01,2016-04-06,-7,1,2015-10-01
5641,2288,19,4,0,0,0,0,2,2,1,...,0,2015-10-01,"Finance & Insurance, Group Benefits",1730.0,2021-10-01,2022-10-01,2016-04-06,-6,12,2016-10-01
2515,2288,19,1,0,0,0,0,0,0,1,...,0,2015-05-01,"Finance & Insurance, Group Benefits",1730.0,2021-10-01,2022-10-01,2016-04-06,-7,5,2015-10-01


In [257]:
df_database.loc[df_database["user_ID"] == 2288, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
787,2021-10-01
2478,2015-05-01


In [258]:
df_workspace.loc[df_workspace["user_ID"] == 2288, "join_date"] = pd.Timestamp("2015-05-01")

In [259]:
df_workspace.loc[df_workspace["user_ID"] == 2288, "palms_date"].max()

Timestamp('2021-10-01 00:00:00')

In [260]:
df_workspace.loc[df_workspace["user_ID"] == 2288, "drop_date"] = pd.NaT

#### - User 504

In [261]:
df_workspace.loc[df_workspace["user_ID"] == 504].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
8996,504,19,3,1,0,0,1,0,2,0,...,8,2016-03-01,"Legal & Accounting, Real Estate Law",832.0,2016-12-01,2022-09-01,2016-04-12,-1,6,2016-09-01
44318,504,9,4,0,0,0,0,0,5,2,...,4,2019-11-01,"Legal & Accounting, Real Estate Law",832.0,2016-12-01,2022-09-01,2016-04-12,2,10,2020-09-01
19921,504,9,4,1,0,0,0,0,2,1,...,2,2017-06-01,"Legal & Accounting, Real Estate Law",832.0,2016-12-01,2022-09-01,2016-04-12,0,3,2017-09-01
47535,504,9,3,0,0,0,0,0,2,1,...,2,2020-03-01,"Legal & Accounting, Real Estate Law",832.0,2016-12-01,2022-09-01,2016-04-12,3,6,2020-09-01
24890,504,9,3,0,0,0,0,1,7,0,...,2,2017-12-01,"Legal & Accounting, Real Estate Law",832.0,2016-12-01,2022-09-01,2016-04-12,1,9,2018-09-01
22444,504,9,4,0,0,0,0,2,1,2,...,2,2017-09-01,"Legal & Accounting, Real Estate Law",832.0,2016-12-01,2022-09-01,2016-04-12,0,12,2018-09-01
8255,504,19,4,0,0,0,0,1,3,0,...,0,2016-02-01,"Legal & Accounting, Real Estate Law",832.0,2016-12-01,2022-09-01,2016-04-12,-1,7,2016-09-01
5639,504,19,4,0,0,0,0,0,1,0,...,0,2015-10-01,"Legal & Accounting, Real Estate Law",832.0,2016-12-01,2022-09-01,2016-04-12,-2,11,2016-09-01


In [262]:
df_workspace.loc[(df_workspace["user_ID"] == 504) & (df_workspace["chapter_ID"] == 19)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
8255,504,19,4,0,0,0,0,1,3,0,...,0,2016-02-01,"Legal & Accounting, Real Estate Law",832.0,2016-12-01,2022-09-01,2016-04-12,-1,7,2016-09-01
8996,504,19,3,1,0,0,1,0,2,0,...,8,2016-03-01,"Legal & Accounting, Real Estate Law",832.0,2016-12-01,2022-09-01,2016-04-12,-1,6,2016-09-01
9707,504,19,2,0,0,0,0,0,0,0,...,0,2016-04-01,"Legal & Accounting, Real Estate Law",832.0,2016-12-01,2022-09-01,2016-04-12,-1,5,2016-09-01


In [263]:
df_workspace.loc[(df_workspace["user_ID"] == 504) & (df_workspace["chapter_ID"] == 9)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
15373,504,9,1,0,0,0,0,0,0,0,...,0,2016-12-01,"Legal & Accounting, Real Estate Law",832.0,2016-12-01,2022-09-01,2016-04-12,0,9,2017-09-01
16117,504,9,4,0,0,0,0,0,1,0,...,1,2017-01-01,"Legal & Accounting, Real Estate Law",832.0,2016-12-01,2022-09-01,2016-04-12,0,8,2017-09-01
16862,504,9,4,0,0,0,0,1,3,0,...,2,2017-02-01,"Legal & Accounting, Real Estate Law",832.0,2016-12-01,2022-09-01,2016-04-12,0,7,2017-09-01


In [264]:
df_database.loc[df_database["user_ID"] == 504, ["join_date"]].drop_duplicates()

Unnamed: 0,join_date
413,2016-12-01
2502,2015-07-01


In [265]:
df_workspace.loc[df_workspace["user_ID"] == 504, "join_date"] = pd.Timestamp("2015-07-01")

In [266]:
df_workspace.loc[df_workspace["user_ID"] == 504, "palms_date"].max()

Timestamp('2021-10-01 00:00:00')

In [267]:
df_workspace.loc[df_workspace["user_ID"] == 504, "drop_date"] = pd.NaT

#### - User 2932

In [268]:
df_workspace.loc[df_workspace["user_ID"] == 2932].sample(8, random_state=23)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
55711,2932,12,3,0,1,0,0,6,2,10,...,25,2021-01-01,"Health & Wellness, Massage Therapist",,2020-05-01,2022-05-01,2020-05-03,0,4,2021-05-01
45312,2932,17,3,1,0,0,0,0,1,4,...,40,2019-12-01,"Health & Wellness, Massage Therapist",,2020-05-01,2022-05-01,2020-05-03,-1,5,2020-05-01
59821,2932,12,4,0,0,0,1,4,25,13,...,36,2021-06-01,"Health & Wellness, Massage Therapist",,2020-05-01,2022-05-01,2020-05-03,1,11,2022-05-01
60660,2932,12,4,0,0,0,0,0,2,4,...,0,2021-07-01,"Health & Wellness, Massage Therapist",,2020-05-01,2022-05-01,2020-05-03,1,10,2022-05-01
44535,2932,17,4,0,0,0,0,7,7,1,...,0,2019-11-01,"Health & Wellness, Massage Therapist",,2020-05-01,2022-05-01,2020-05-03,-1,6,2020-05-01
63135,2932,12,3,0,0,0,1,8,30,5,...,30,2021-10-01,"Health & Wellness, Massage Therapist",,2020-05-01,2022-05-01,2020-05-03,1,7,2022-05-01
52560,2932,12,2,0,3,0,0,0,2,6,...,0,2020-09-01,"Health & Wellness, Massage Therapist",,2020-05-01,2022-05-01,2020-05-03,0,8,2021-05-01
53360,2932,12,1,0,3,0,0,13,11,3,...,53,2020-10-01,"Health & Wellness, Massage Therapist",,2020-05-01,2022-05-01,2020-05-03,0,7,2021-05-01


In [269]:
df_workspace.loc[(df_workspace["user_ID"] == 2932) & (df_workspace["chapter_ID"] == 17)].tail(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
47777,2932,17,1,0,2,0,0,8,7,0,...,29,2020-03-01,"Health & Wellness, Massage Therapist",,2020-05-01,2022-05-01,2020-05-03,-1,2,2020-05-01
48619,2932,17,4,1,0,0,0,8,5,1,...,12,2020-04-01,"Health & Wellness, Massage Therapist",,2020-05-01,2022-05-01,2020-05-03,-1,1,2020-05-01
49454,2932,17,0,1,0,0,0,0,0,0,...,0,2020-05-01,"Health & Wellness, Massage Therapist",,2020-05-01,2022-05-01,2020-05-03,0,12,2021-05-01


In [270]:
df_workspace.loc[(df_workspace["user_ID"] == 2932) & (df_workspace["chapter_ID"] == 12)].head(3)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,industry,sponsor_ID,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
49337,2932,12,4,0,0,0,0,3,4,4,...,45,2020-05-01,"Health & Wellness, Massage Therapist",,2020-05-01,2022-05-01,2020-05-03,0,12,2021-05-01
50152,2932,12,3,0,1,0,0,4,9,1,...,29,2020-06-01,"Health & Wellness, Massage Therapist",,2020-05-01,2022-05-01,2020-05-03,0,11,2021-05-01
50954,2932,12,2,1,1,0,0,0,4,4,...,10,2020-07-01,"Health & Wellness, Massage Therapist",,2020-05-01,2022-05-01,2020-05-03,0,10,2021-05-01


#### - User 2889

#### - User 7

#### - User 2150

#### - User 1703

#### - User 1140

#### - User 1324

#### - User 508

#### - User 2838

#### - User 2673

#### - User 1654

### This TEST

In [211]:
df_master_clean.head()

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,TYFCB,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,relative_renewal_date
0,733,1,3,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Food/Beverages, Caterer",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01
2,1150,1,3,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Trades, Heating & A/C",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01
3,414,1,3,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Insurance, Group Benefits Consultant",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01
4,1721,1,3,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Marketing, Marketing Services",2016-04-01,2017-04-01,NaT,-1,1,2016-04-01
5,1919,1,4,0,0,0,0,0,0,0,...,0,0,2016-03-01,"Real Estate, Real Estate Sales Representative ...",2016-04-01,2018-12-01,NaT,-1,9,2016-12-01


In [212]:
df_master_clean.shape[0]

29207

In [213]:
df_master_clean.drop_duplicates(subset=["user_ID", "chapter_ID", "palms_date"], inplace=True)
df_master_clean.shape[0]

29185

In [215]:
df_master_clean.to_csv("data/df_master_Cleaned_NoAgg.csv")

## Re-calculate membership year

## Delete unnecessary columns

## Aggregate 9-months data
Ensure sure that each groupby sum is aggregated for 9 months, not less.

In [38]:
df_9_months = df_master_clean.copy()

# df_9_months = df_9_months.loc[df_9_months["months_to_renewal"] >= 4]

# df_3_months.drop(["years_to_renewal", "months_to_renewal"], axis=1, inplace=True)

Unnamed: 0,user_ID,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,...,CEU,palms_date,profession,join_date,renewal_date,drop_date,year_of_membership,months_to_renewal,years_to_renewal,next_renewal_date
37248,734,27,1,0,0,0,0,0,0,0,...,0,2020-02-01,"Health & Wellness, Massage Therapist",2020-03-01,2021-08-01,NaT,-1,6,1,2020-08-01
38028,734,27,4,0,0,0,0,2,3,0,...,11,2020-03-01,"Health & Wellness, Massage Therapist",2020-03-01,2021-08-01,NaT,0,5,1,2020-08-01
38803,822,27,5,0,0,0,0,3,0,2,...,0,2020-04-01,"Office, Commercial Painting",2019-10-01,2020-10-01,NaT,0,6,0,2020-10-01
38809,2904,27,5,0,0,0,0,0,2,0,...,7,2020-04-01,"Legal & Accounting, Bookkeeping",2020-03-01,2021-10-01,NaT,0,6,1,2020-10-01
38816,734,27,5,0,0,0,0,3,3,2,...,13,2020-04-01,"Health & Wellness, Massage Therapist",2020-03-01,2021-08-01,NaT,0,4,1,2020-08-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46298,734,27,4,0,0,0,0,0,4,5,...,8,2021-02-01,"Health & Wellness, Massage Therapist",2020-03-01,2021-08-01,NaT,0,6,0,2021-08-01
46299,1580,27,4,0,0,0,0,1,4,1,...,3,2021-02-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,0,5,0,2021-07-01
46300,1580,27,4,0,0,0,0,1,4,1,...,3,2021-02-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-09-28,0,5,0,2021-07-01
46301,1580,27,4,0,0,0,0,1,4,1,...,3,2021-02-01,"Advertising & Marketing, Photographer",2020-07-01,2021-07-01,2020-03-18,0,5,0,2021-07-01


In [21]:
# group by "renewal_date"
# The rest of the columns listed in groupby remain the same for the given time period.
test = df_3_months.copy()
test = test.groupby(["user_ID", "renewal_date"]).sum()
# test.reset_index(level=["renewal_date"], inplace=True)
test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,chapter_ID,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU,year_of_membership
user_ID,renewal_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2,2017-05-01,51,11,0,0,0,0,1,13,5,12,0,10,4503,1,3
2,2018-05-01,51,12,0,0,0,0,2,3,2,6,0,12,4027,4,6
2,2019-05-01,51,10,0,0,0,0,2,3,3,4,1,5,1445,2,9
2,2020-05-01,51,10,0,0,0,1,4,3,6,27,3,9,1873,9,12
2,2021-05-01,51,9,1,0,0,0,0,9,5,4,0,4,1000,1,15


In [125]:
test.loc[test["user_ID"] == 420]

Unnamed: 0_level_0,user_ID,chapter_ID,profession,membership_length,P,A,L,M,S,RGI,RGO,RRI,RRO,V,1-2-1,TYFCB,CEU
renewal_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2016-07-01,420,8,"Finance & Insurance, Financial Advisor/Financi...",17,4,1,0,0,0,1,1,0,0,0,1,3195,1
2017-07-01,420,8,"Finance & Insurance, Financial Advisor/Financi...",18,9,1,0,0,3,1,2,2,1,0,16,7421,3
2018-07-01,420,8,"Finance & Insurance, Financial Advisor/Financi...",19,11,1,0,0,0,9,8,1,8,1,15,21651,23
2019-07-01,420,8,"Finance & Insurance, Financial Advisor/Financi...",20,9,1,0,0,2,1,8,1,18,1,7,3734,11
2020-07-01,420,8,"Finance & Insurance, Financial Advisor/Financi...",21,11,0,0,0,0,2,14,0,8,1,7,1796,14
2021-07-01,420,8,"Finance & Insurance, Financial Advisor/Financi...",22,7,0,0,0,0,2,11,2,7,1,9,3891,11
