# Deep Learning for Convertion Prediction

Goal: How likely it is for an existing customer to convert again

## Importing Packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import torch
from torch.utils.data import DataLoader
from torch import nn, optim
import torch.nn.functional as F

import pickle

seed=101096

## Data Preparation

In [2]:
column_names = ['ID',
                'Book length (mins)_overall',
                'Book length (mins)_avg',
                'Price_overall',
                'Price_avg',
                'Review',
                'Review 10/10',
                'Minutes listened',
                'Completion',
                'Support Requests',
                'Las visited minus Purchase date',
                'Targets']

df = pd.read_csv('Data/audiobooks_data.csv', names=column_names, header=None)
df.head()

Unnamed: 0,ID,Book length (mins)_overall,Book length (mins)_avg,Price_overall,Price_avg,Review,Review 10/10,Minutes listened,Completion,Support Requests,Las visited minus Purchase date,Targets
0,873,2160.0,2160,10.13,10.13,0,8.91,0.0,0.0,0,0,1
1,611,1404.0,2808,6.66,13.33,1,6.5,0.0,0.0,0,182,1
2,705,324.0,324,10.13,10.13,1,9.0,0.0,0.0,1,334,1
3,391,1620.0,1620,15.31,15.31,0,9.0,0.0,0.0,0,183,1
4,819,432.0,1296,7.11,21.33,1,9.0,0.0,0.0,0,0,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14084 entries, 0 to 14083
Data columns (total 12 columns):
ID                                 14084 non-null int64
Book length (mins)_overall         14084 non-null float64
Book length (mins)_avg             14084 non-null int64
Price_overall                      14084 non-null float64
Price_avg                          14084 non-null float64
Review                             14084 non-null int64
Review 10/10                       14084 non-null float64
Minutes listened                   14084 non-null float64
Completion                         14084 non-null float64
Support Requests                   14084 non-null int64
Las visited minus Purchase date    14084 non-null int64
Targets                            14084 non-null int64
dtypes: float64(6), int64(6)
memory usage: 1.3 MB


We need to match the number of 1s to the number of 0s in order to balance the data

In [4]:
one_count = df[df['Targets']==1].shape[0]
zero_count = df[df['Targets']==0].shape[0]
print(one_count, zero_count)

2237 11847


In [5]:
zero_target = df[df['Targets']==0].sample(one_count, random_state=seed)
one_target = df[df['Targets']==1]
new_df = pd.concat([zero_target, one_target], axis=0)
new_df = shuffle(new_df, random_state=seed).reset_index(drop=True)

In [7]:
new_one_count = new_df[new_df['Targets']==1].shape[0]
new_zero_count = new_df[new_df['Targets']==0].shape[0]
print(new_one_count, new_zero_count)

2237 2237


Now we have balanced data

In [8]:
x_columns = ['Book length (mins)_overall',
             'Book length (mins)_avg',
             'Price_overall',
             'Price_avg',
             'Review',
             'Review 10/10',
             'Minutes listened',
             'Completion',
             'Support Requests',
             'Las visited minus Purchase date']
X = new_df[x_columns]
y = new_df['Targets']

In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=.2, random_state=seed)

In [10]:
scaler = StandardScaler()
X_train_scaler = scaler.fit_transform(X_train)
X_valid_scaler = scaler.transform(X_valid)

In [11]:
train = [*zip(X_train_scaler.astype(np.float32), y_train.values)]
valid = [*zip(X_valid_scaler.astype(np.float32), y_valid.values)]

In [34]:
batch_size=16

In [35]:
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size)
valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size)

## Model

In [36]:
class Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(10, 50)
        self.fc2 = nn.Linear(50, 100)
        self.fc3 = nn.Linear(100, 50)
        self.fc4 = nn.Linear(50, 2)
        
        self.dropout = nn.Dropout(p=0.2)
        
    def forward(self, x):
        # make sure input tensor is flattened      
        x = self.dropout(F.elu(self.fc1(x)))
        x = self.dropout(F.elu(self.fc2(x)))
        x = self.dropout(F.elu(self.fc3(x)))
        x = F.elu(self.fc4(x))
        
        return x

In [37]:
model = Classifier()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs=50

## Train and Validation

In [38]:
train_losses, valid_losses = [], []
for e in range(epochs):
    tot_train_loss = 0
    for inputs, labels in train_loader:
        
        optimizer.zero_grad()
        
        log_ps = model(inputs)
        loss = criterion(log_ps, labels)
        tot_train_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    else:
        tot_valid_loss = 0
        valid_correct = 0  # Number of correct predictions on the valid set
        
        # Turn off gradients for validation, saves memory and computations
        with torch.no_grad():
            for inputs, labels in valid_loader:
                log_ps = model(inputs)
                loss = criterion(log_ps, labels)
                tot_valid_loss += loss.item()

                ps = torch.exp(log_ps)
                top_p, top_class = ps.topk(1, dim=1)
                equals = top_class == labels.view(*top_class.shape)
                valid_correct += equals.sum().item()

        # Get mean loss to enable comparison between train and valid sets
        train_loss = tot_train_loss / len(train_loader.dataset)
        valid_loss = tot_valid_loss / len(valid_loader.dataset)

        # At completion of epoch
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)

        print("Epoch: {}/{}.. ".format(e+1, epochs),
              "Training Loss: {:.3f}.. ".format(train_loss),
              "Valid Loss: {:.3f}.. ".format(valid_loss),
              "Valid Accuracy: {:.3f}".format(valid_correct / len(valid_loader.dataset)))

Epoch: 1/50..  Training Loss: 0.030..  Valid Loss: 0.027..  Valid Accuracy: 0.753
Epoch: 2/50..  Training Loss: 0.026..  Valid Loss: 0.026..  Valid Accuracy: 0.753
Epoch: 3/50..  Training Loss: 0.026..  Valid Loss: 0.026..  Valid Accuracy: 0.745
Epoch: 4/50..  Training Loss: 0.025..  Valid Loss: 0.025..  Valid Accuracy: 0.784
Epoch: 5/50..  Training Loss: 0.025..  Valid Loss: 0.025..  Valid Accuracy: 0.775
Epoch: 6/50..  Training Loss: 0.024..  Valid Loss: 0.025..  Valid Accuracy: 0.778
Epoch: 7/50..  Training Loss: 0.024..  Valid Loss: 0.025..  Valid Accuracy: 0.777
Epoch: 8/50..  Training Loss: 0.024..  Valid Loss: 0.024..  Valid Accuracy: 0.800
Epoch: 9/50..  Training Loss: 0.023..  Valid Loss: 0.024..  Valid Accuracy: 0.800
Epoch: 10/50..  Training Loss: 0.023..  Valid Loss: 0.024..  Valid Accuracy: 0.803
Epoch: 11/50..  Training Loss: 0.023..  Valid Loss: 0.023..  Valid Accuracy: 0.792
Epoch: 12/50..  Training Loss: 0.023..  Valid Loss: 0.024..  Valid Accuracy: 0.807
Epoch: 13/50.