The following code predicts Day 30 LTV for each user on Day 1 based on only 4 features using a Random Forest Regression with 95% accuracy and an r2 value of .99

In [1]:
# # Importing the necessary libraries
import pandas as pd
import numpy as np 
import seaborn as sns

# Importing the csv data and exploring the data structure
df = pd.read_csv("d30_ltv_data.csv")
df.head()

Unnamed: 0,channel,state,onboarding_completed,d0_payments,d30_payments
0,Stripe,California,Y,10,45
1,Instagram,Washington,N,0,6
2,Facebook,Oregon,Y,10,17
3,Shopify,California,Y,10,20
4,Stripe,California,N,0,9


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   channel               211 non-null    object
 1   state                 211 non-null    object
 2   onboarding_completed  211 non-null    object
 3   d0_payments           211 non-null    int64 
 4   d30_payments          211 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 8.4+ KB


In [3]:
# Random Forest can only handle numbers, so we need to import a library that helps us encode all values into numbers
from sklearn.preprocessing import LabelEncoder

# Now we're using the encoder to convert string columns into numbers
channel_enc = LabelEncoder()
state_enc = LabelEncoder()
onboarding_completed_enc = LabelEncoder()

# Creating new columns that match the encoded values from above
df['channel_values'] = channel_enc.fit_transform(df['channel'])
df['state_values'] = state_enc.fit_transform(df['state'])
df['onboarding_completed_values'] = onboarding_completed_enc.fit_transform(df['onboarding_completed'])

# Checking to ensure our encoding worked properly on our independent variable set
df.head()

Unnamed: 0,channel,state,onboarding_completed,d0_payments,d30_payments,channel_values,state_values,onboarding_completed_values
0,Stripe,California,Y,10,45,3,0,1
1,Instagram,Washington,N,0,6,1,2,0
2,Facebook,Oregon,Y,10,17,0,1,1
3,Shopify,California,Y,10,20,2,0,1
4,Stripe,California,N,0,9,3,0,0


In [4]:
# Creating a new dataframe that drops all the duplicated columns that have now been encoded into numbers
df_clean = df.drop(['channel','state','onboarding_completed'],axis=1)

# Checking it all looks good and we haven't missed anything
df_clean.head()

Unnamed: 0,d0_payments,d30_payments,channel_values,state_values,onboarding_completed_values
0,10,45,3,0,1
1,0,6,1,2,0
2,10,17,0,1,1
3,10,20,2,0,1
4,0,9,3,0,0


In [5]:
# Creating the feature selection for the regressor and the target variable we're trying to predict

# we need to remove the 'answer' from the feature selection to train the model properly so we drop the target variable
X = df_clean.drop(['d30_payments'],axis = 'columns')
y = df_clean['d30_payments']

# Showing that the field we're predicting has been removed
X.head()

Unnamed: 0,d0_payments,channel_values,state_values,onboarding_completed_values
0,10,3,0,1
1,0,1,2,0
2,10,0,1,1
3,10,2,0,1
4,0,3,0,0


In [6]:
y.head()

0    45
1     6
2    17
3    20
4     9
Name: d30_payments, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=10) 
# test size means 20% of dataset will be withheld for testing purposes
# random state means if we run this all again it will start from same place and be verifiable

In [8]:
print(y_train.info())
print(y_test.info())

<class 'pandas.core.series.Series'>
Index: 168 entries, 205 to 9
Series name: d30_payments
Non-Null Count  Dtype
--------------  -----
168 non-null    int64
dtypes: int64(1)
memory usage: 2.6 KB
None
<class 'pandas.core.series.Series'>
Index: 43 entries, 83 to 55
Series name: d30_payments
Non-Null Count  Dtype
--------------  -----
43 non-null     int64
dtypes: int64(1)
memory usage: 688.0 bytes
None


In [9]:
from sklearn.ensemble import RandomForestRegressor

In [10]:
rfr = RandomForestRegressor(random_state=10)

In [11]:
rfr.fit(X_train, y_train)

In [12]:
y_pred = rfr.predict(X_test)
print(y_pred)

[18.         40.          7.30096338 23.6356746  12.01       51.86
 20.70918254  7.30096338  2.04076408  7.30096338  2.04076408 40.
  2.12        2.04076408 18.         43.58333333 51.86       23.6356746
  9.99006999 23.6356746   2.04076408 15.66       19.         44.244
  2.04076408 23.82030159  2.          9.99006999 43.58333333  2.04076408
 19.          2.04076408  1.16057544  7.30096338 18.38780952  9.99006999
  7.30096338 15.66       19.          1.16057544 12.01        2.
  2.        ]


In [13]:
pred_accuracy = pd.DataFrame(y_test)

In [14]:
pred_accuracy['predictions']= y_pred

In [15]:
pred_accuracy['diff'] = pred_accuracy['predictions'] - pred_accuracy['d30_payments']

In [16]:
pred_accuracy['pct_diff'] = (pred_accuracy['diff']/pred_accuracy['d30_payments'])*100

In [17]:
print(pred_accuracy)

     d30_payments  predictions      diff   pct_diff
83             18    18.000000  0.000000   0.000000
183            40    40.000000  0.000000   0.000000
26              7     7.300963  0.300963   4.299477
197            23    23.635675  0.635675   2.763803
176            12    12.010000  0.010000   0.083333
68             52    51.860000 -0.140000  -0.269231
59             19    20.709183  1.709183   8.995698
24              7     7.300963  0.300963   4.299477
49              2     2.040764  0.040764   2.038204
61              7     7.300963  0.300963   4.299477
46              2     2.040764  0.040764   2.038204
127            40    40.000000  0.000000   0.000000
52              2     2.120000  0.120000   6.000000
105             2     2.040764  0.040764   2.038204
203            18    18.000000  0.000000   0.000000
168            44    43.583333 -0.416667  -0.946970
163            52    51.860000 -0.140000  -0.269231
47             23    23.635675  0.635675   2.763803
19          

In [18]:
# Calculating the accuracy of the model

# Creating some basic building blocks for the variance calculations
count = pred_accuracy['predictions'].count()
less_than_three = (pred_accuracy['pct_diff'] <= 3).sum()
less_than_five = (pred_accuracy['pct_diff'] <= 5).sum()
less_than_ten = (pred_accuracy['pct_diff'] <= 10).sum()
three_pct = round((less_than_three/count)*100,2)
five_pct = round((less_than_five/count)*100,2)
ten_pct = round((less_than_ten/count)*100,2)

# Summary of TRAINING Model Accuracy
print( )
print("Summary of TRAINING Model Accuracy:")
print("Count of Total Observations in Model: " + str(count))
print("Total # of predictions <= 10% variance= " + str(less_than_ten) + "; " + str(ten_pct) + "% of observations")
print("Total # of predictions <= 5% variance= " + str(less_than_five) + "; " + str(five_pct) + "% of observations")
print("Total # of predictions <= 3% variance= " + str(less_than_three) + "; " + str(three_pct) + "% of observations")
print( )


Summary of TRAINING Model Accuracy:
Count of Total Observations in Model: 43
Total # of predictions <= 10% variance= 41; 95.35% of observations
Total # of predictions <= 5% variance= 39; 90.7% of observations
Total # of predictions <= 3% variance= 34; 79.07% of observations



In [19]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [20]:
# The goal of the MAE is to get it as close to 0 as possible
mean_absolute_error(y_pred,y_test)

np.float64(0.325883794682671)

In [21]:
# The goal of the MSE is to get it as close to 0 as possible
mean_squared_error(y_pred,y_test)

np.float64(0.3259505604841193)

In [22]:
# The goal of the r^2 score is to get it as close to 1 as possible
r2_score(y_pred,y_test)

0.9985162282487894