In [None]:
#Loading and viewing Data
#I will load the date file for this example and checkout summary statistics and columns for that file.

In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import sklearn.metrics

prospect_data = pd.read_csv("browsing2.csv")

prospect_data.dtypes

SESSION_ID         int64
IMAGES             int64
REVIEWS            int64
FAQ                int64
SPECS              int64
SHIPPING           int64
BOUGHT_TOGETHER    int64
COMPARE_SIMILAR    int64
VIEW_WARRANTY      int64
SPONSORED_LINKS    int64
PURCHASE           int64
dtype: object

In [2]:
#sample of the top records to understand how the date looks like.
prospect_data.head()

Unnamed: 0,SESSION_ID,IMAGES,REVIEWS,FAQ,SPECS,SHIPPING,BOUGHT_TOGETHER,COMPARE_SIMILAR,VIEW_WARRANTY,SPONSORED_LINKS,PURCHASE
0,1001,0,1,0,1,0,1,1,0,1,0
1,1002,0,1,0,0,1,0,0,0,1,0
2,1003,1,1,0,1,1,0,0,0,1,0
3,1004,1,0,0,0,1,0,1,0,1,0
4,1005,1,1,0,1,1,1,1,0,0,0


In [3]:
#Summary statistics analysis of the data
prospect_data.describe()

Unnamed: 0,SESSION_ID,IMAGES,REVIEWS,FAQ,SPECS,SHIPPING,BOUGHT_TOGETHER,COMPARE_SIMILAR,VIEW_WARRANTY,SPONSORED_LINKS,PURCHASE
count,672.0,672.0,672.0,672.0,672.0,672.0,672.0,672.0,672.0,672.0,672.0
mean,1336.5,0.470238,0.474702,0.470238,0.474702,0.471726,0.474702,0.473214,0.46875,0.474702,0.186012
std,194.133974,0.499485,0.499732,0.499485,0.499732,0.499572,0.499732,0.499654,0.499394,0.499732,0.389406
min,1001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1168.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1336.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1504.25,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
max,1672.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
#Perform correlation analysis

In [4]:
prospect_data.corr()['PURCHASE']

SESSION_ID        -0.005825
IMAGES             0.039998
REVIEWS            0.005071
FAQ               -0.036623
SPECS             -0.033221
SHIPPING          -0.022720
BOUGHT_TOGETHER    0.020388
COMPARE_SIMILAR    0.044795
VIEW_WARRANTY      0.026104
SPONSORED_LINKS    0.028047
PURCHASE           1.000000
Name: PURCHASE, dtype: float64

In [5]:
#Looking at the correlations above we can see that some features 
#like COMPARE_SIMILAR, IMAGES, SPONSORED_LINKS and VIEW_WARRANTY have medium correlation to the target variable.
#I will reduce the feature set to that list of variables.

In [6]:
#Drop columns with low correlation
predictors = prospect_data[['COMPARE_SIMILAR', 'IMAGES','SPONSORED_LINKS', 'VIEW_WARRANTY']]
targets = prospect_data.PURCHASE

In [7]:
#Training and Testing split
#Now I will split the model into training and testing data in the ratio of 70:30

pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.3)

print("Predictor - Training : ", pred_train.shape, "predictor - Testing : ", pred_test.shape)

Predictor - Training :  (470, 4) predictor - Testing :  (202, 4)


In [8]:
#Build Model and Check Accuracy

In [10]:
from sklearn.naive_bayes import GaussianNB

classifier=GaussianNB()
classifier=classifier.fit(pred_train,tar_train)

predictions=classifier.predict(pred_test)

#Analyze accuracy of predictions
sklearn.metrics.confusion_matrix(tar_test,predictions)

array([[163,   0],
       [ 39,   0]])

In [11]:
sklearn.metrics.accuracy_score(tar_test,predictions)

0.80693069306930698

In [None]:
#Instead of doing a Yes/No prediction,
#instead I'm going to do a probability computation to show the probablity for the prospect to buy the product

In [12]:
pred_prob=classifier.predict_proba(pred_test)
pred_prob[0,1]

0.17853742460223293

The Probability above can be read as 17% chance that the prospect will buy the product.

# Real time predictions

Now that the model has been built, I'm going to use it for real time predictions. So when the customer starts visiting the pages one by one, I will collect that list and then use it to compute the probability. We do that for every new click that comes in.

Lets go. The prospect just came to my website. There are no significant clicks. Let us compute the probability. The array of values passed has the values for COMPARE_SIMILAR, IMAGES, SPONSORED_LINKS and VIEW_WARRANTY. So the array is all zeros to begin with

In [15]:
browsing_data = np.array([0,0,0,0]).reshape(1,-1)
print("New visitor: propensity :",classifier.predict_proba(browsing_data)[:,1])

New visitor: propensity : [ 0.14833921]


So the initial probability is 15%. Now, suppose the customer clicks does a comparison of similar products. the array changes to include a 1 for that function. The new probability will be

In [16]:
browsing_data = np.array([1,0,0,0]).reshape(1,-1)
print("After checking similar products: propensity :",classifier.predict_proba(browsing_data)[:,1])

After checking similar products: propensity : [ 0.17291468]


Now he goes to click on Images.

In [17]:
browsing_data = np.array([1,1,0,0]).reshape(1,-1)
print("After checking images: propensity :",classifier.predict_proba(browsing_data)[:,1])

After checking images: propensity : [ 0.20690045]


It goes to 20%. You can have a threshold for when you want to offer chat. You can keep checking this probabiloty against that threshold to see if you want to popup a chat window.

This example shows you how you can use predictive analytics in realtime to decide whether a prospect has high propensity to convert and offer him a chat with a sales rep/agent.