In [29]:
#imports
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

##Initial data analysis and cleaning

In [30]:
#reading the csv file
dir = Path('data\2023 HCP Case Data - Sheet1.csv').resolve().parents[1]
file_path = dir / 'data' / '2023 HCP Case Data - Sheet1.csv'
df = pd.read_csv(file_path, sep=',')
#print(df.shape)
#df.head(5)

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16680 entries, 0 to 16679
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   OrgID                16680 non-null  int64  
 1   EnrollDate           16680 non-null  object 
 2   ChurnDate            6703 non-null   object 
 3   OrgSize              16671 non-null  object 
 4   Industry             16655 non-null  object 
 5   IndustryGroup        16680 non-null  object 
 6   Enrollment Plan      16680 non-null  object 
 7   PromoType            16680 non-null  object 
 8   Acquisition Channel  15947 non-null  object 
 9   F28 Active Rate      16680 non-null  object 
 10  Avg Lifetime Active  16680 non-null  object 
 11  F28 Cc Flag          16680 non-null  int64  
 12  eLTV/CAC             16549 non-null  float64
dtypes: float64(1), int64(2), object(10)
memory usage: 1.7+ MB


In [32]:
#data cleaning
df['EnrollDate'] = pd.to_datetime(df['EnrollDate'])
df['ChurnDate'] = pd.to_datetime(df['ChurnDate'])
df['F28 Active Rate'] = df['F28 Active Rate'].str.replace('%', '').astype(float) / 100
df['Avg Lifetime Active'] = df['Avg Lifetime Active'].str.replace('%', '').astype(float) / 100
df['Enrollment Plan'] = df['Enrollment Plan'].str.capitalize()
df['Enrollment Plan'] = pd.Categorical(df['Enrollment Plan'], categories=['Starter', 'Small', 'Medium', 'Large', 'Extra large'], ordered=True)
df['OrgSize'] = pd.Categorical(df['OrgSize'], categories=['0-1', '2-6', '7-10', '11+'], ordered=True) #there are NaN values in this column, probably due to some input error
df['PromoType'] = pd.Categorical(df['PromoType'], categories=['No Promo', 'One Month', 'Special - One Month', '2+ month'], ordered=True)
df['Churned'] = df['ChurnDate'].notnull().astype(int)
df['Churned'] = pd.Categorical(df['Churned'], categories=[0, 1], ordered=True)
#droping rows

#droping Enrollment Plan = Freeze, only 4 orgIDS, not enough to make a good analysis
i = df[df['Enrollment Plan'] == 'Freeze'].index
df.drop(labels = i, axis = 0, inplace=True)




In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16680 entries, 0 to 16679
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   OrgID                16680 non-null  int64         
 1   EnrollDate           16680 non-null  datetime64[ns]
 2   ChurnDate            6703 non-null   datetime64[ns]
 3   OrgSize              16671 non-null  category      
 4   Industry             16655 non-null  object        
 5   IndustryGroup        16680 non-null  object        
 6   Enrollment Plan      16677 non-null  category      
 7   PromoType            16680 non-null  category      
 8   Acquisition Channel  15947 non-null  object        
 9   F28 Active Rate      16680 non-null  float64       
 10  Avg Lifetime Active  16680 non-null  float64       
 11  F28 Cc Flag          16680 non-null  int64         
 12  eLTV/CAC             16549 non-null  float64       
 13  Churned              16680 non-

In [34]:
df.describe(include='all')

Unnamed: 0,OrgID,EnrollDate,ChurnDate,OrgSize,Industry,IndustryGroup,Enrollment Plan,PromoType,Acquisition Channel,F28 Active Rate,Avg Lifetime Active,F28 Cc Flag,eLTV/CAC,Churned
count,16680.0,16680,6703,16671,16655,16680,16677,16680,15947,16680.0,16680.0,16680.0,16549.0,16680.0
unique,,,,4,97,13,5,4,7,,,,,2.0
top,,,,0-1,Heating & Air Conditioning,Other,Small,One Month,Sales,,,,,0.0
freq,,,,8290,2974,4236,10010,10225,6002,,,,,9977.0
mean,350416.848561,2021-05-09 15:25:12.517985792,2021-07-08 10:14:11.797702400,,,,,,,0.283231,0.247089,0.258153,3.349375,
min,44.0,2020-01-04 00:00:00,2020-01-10 00:00:00,,,,,,,0.0,0.0,0.0,0.1,
25%,352488.0,2021-03-03 18:00:00,2021-05-10 00:00:00,,,,,,,0.036,0.011,0.0,1.2,
50%,368231.0,2021-05-27 00:00:00,2021-08-08 00:00:00,,,,,,,0.143,0.078,0.0,2.2,
75%,390281.25,2021-08-20 00:00:00,2021-10-19 00:00:00,,,,,,,0.5,0.464,1.0,3.8,
max,410653.0,2022-10-28 00:00:00,2022-12-09 00:00:00,,,,,,,1.0,1.0,1.0,40.0,


In [35]:
#create field to see how long a customer has been enrolled
df['Enrollment Duration'] = (df['ChurnDate'] - df['EnrollDate']).dt.days
max_date = df['ChurnDate'].max()
print(f'Max Date: {max_date}')
df['Enrollment Duration'] = df['Enrollment Duration'].fillna((max_date - df['EnrollDate']).dt.days)

#creat unique identifier of users
df['User_id'] = df['OrgID'].astype(str) + df['EnrollDate'].dt.strftime('%Y%m%d')
df.head()

Max Date: 2022-12-09 00:00:00


Unnamed: 0,OrgID,EnrollDate,ChurnDate,OrgSize,Industry,IndustryGroup,Enrollment Plan,PromoType,Acquisition Channel,F28 Active Rate,Avg Lifetime Active,F28 Cc Flag,eLTV/CAC,Churned,Enrollment Duration,User_id
0,44,2021-02-01,NaT,0-1,Flooring,Other,Extra large,No Promo,Marketing - Paid,0.607,0.096,0,4.1,0,676.0,4420210201
1,70,2022-03-01,NaT,0-1,Plumbing,Plumbing,Starter,No Promo,,0.071,0.34,0,1.2,0,283.0,7020220301
2,1494,2020-10-02,NaT,0-1,Carpet Cleaning,Carpet Cleaning,Small,No Promo,Product,1.0,0.379,1,3.4,0,798.0,149420201002
3,1604,2020-10-08,2020-12-08,11+,Restoration,Other,Large,2+ month,Sales,0.5,0.004,0,6.2,1,61.0,160420201008
4,1652,2020-02-03,2020-03-02,11+,Solar & Energy,Other,Large,One Month,Sales,0.0,0.001,0,4.7,1,28.0,165220200203


In [36]:
#save cleaned data into a csv file and a pickle file

df.to_csv(dir / 'data' / 'cleaned_data.csv', index=False, sep=',')
df.to_pickle(dir / 'data' / 'cleaned_data.pkl')