In [1]:
import pandas as pd 
from matplotlib import pyplot as plt 
import seaborn as sns
sns.set()

import numpy as np 
import matplotlib 

# Seed NumPy's legacy global random state with a fixed value.
# NOTE(review): presumably so that later steps that draw from the global state
# (e.g. the unseeded shuffle) repeat on a fresh top-to-bottom run -- confirm.
np.random.seed(42)

# QUESTION: What type of users are more likely to go premium?

In [2]:
# Load the per-user aggregates from the CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='data/Presentations_under5.csv')

### This dataset contains per-user aggregates (number of presentations, average views, and average slides) for a random sample of users

### For premium users, the aggregates include only the presentations created BEFORE they converted to premium, so that post-conversion behavior does not leak into the features and bias or overfit our model

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,UserId,total_pre,average_views,average_slides,UserId.1,PremiumStartDate,RegisterDate
0,3fbd6ca8-5e0e-4210-a7be-64a480f20873,5,0,2.0,3fbd6ca8-5e0e-4210-a7be-64a480f20873,2019-06-21 13:59:02.217,2019-06-21 13:35:42.970
1,b2e1328d-e603-43cc-8564-bc74bcb8261d,1,0,10.0,b2e1328d-e603-43cc-8564-bc74bcb8261d,2016-07-27 14:15:03.963,2016-07-27 14:04:14.880
2,80a54a74-f071-49bb-b599-cfb040621c2f,2,81,9.0,80a54a74-f071-49bb-b599-cfb040621c2f,2016-05-17 20:17:36.207,2015-05-15 03:32:37.687
3,a187dfa7-e7de-4d52-967c-7f5ff5bc2fb0,3,9,4.0,a187dfa7-e7de-4d52-967c-7f5ff5bc2fb0,2017-04-29 06:50:14.073,2016-03-31 12:31:56.610
4,b8cdc8ad-d443-427d-b4c0-052023da593f,3,9,6.0,b8cdc8ad-d443-427d-b4c0-052023da593f,2018-03-01 03:13:36.323,2016-09-27 01:03:31.210


In [4]:
# 'UserId.1' duplicates 'UserId', so remove it.
df = df.drop(columns=['UserId.1'])

In [5]:
# (rows, columns) of the frame after dropping the duplicate id column
df.shape

(66458, 6)

In [6]:
# Shuffle the rows and renumber the index from 0.
# random_state is passed explicitly so the permutation does not depend on how much
# of the global NumPy random state earlier (possibly re-run) cells have consumed.
# Note: this cell reassigns `df`, so run it once per kernel session; re-running it
# shuffles the already shuffled frame again.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [7]:
# Encode the target in place for the ML model:
# 1 if the user has a PremiumStartDate, 0 if it is null.
# A single vectorized mask replaces the original iterrows() loop, which performed
# one Python-level .loc write per row.
# Treating 0 as "not premium" mirrors the original fillna(0) / `!= 0` logic and keeps
# the cell idempotent: re-running it on the already encoded 0/1 column changes nothing.
# The explicit int64 cast gives the label column a plain integer dtype on every platform.
df['PremiumStartDate'] = (
    df['PremiumStartDate'].notna() & df['PremiumStartDate'].ne(0)
).astype('int64')

In [8]:
# Keep only the numeric columns: remove the user id and the registration timestamp.
df_tree = df.drop(labels=['UserId', 'RegisterDate'], axis='columns')

In [9]:
# Preview of the frame that feeds the decision tree model.
df_tree.head(n=5)

Unnamed: 0,total_pre,average_views,average_slides,PremiumStartDate
0,1,1,10.0,1
1,4,54,4.0,0
2,2,17,7.0,0
3,1,0,1.0,0
4,2,31,11.0,0


In [10]:
# Discard every row that has a null in any column.
df_tree = df_tree.dropna(axis=0, how='any')

In [11]:
# Name of the label column; every other column of df_tree is used as a feature.
target_variable = 'PremiumStartDate'
independent_variables = df_tree.columns.drop(target_variable)

In [48]:
# Model setup: a decision tree classifier limited to a depth of 5.
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(
    max_depth=5,
)

In [49]:
# Train the tree on all feature columns against the 0/1 premium label.
tree.fit(
    X=df_tree[independent_variables],
    y=df_tree['PremiumStartDate'],
)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [50]:
# Predicted labels for the same rows the tree was fitted on (in-sample predictions).
tree.predict(X=df_tree[independent_variables])

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [51]:
# Mean accuracy over 3-fold cross-validation.
# NOTE(review): a single mean score does not by itself rule out bias or overfitting;
# compare it with the training score and a majority-class baseline before concluding that.
cross_val_score(
    estimator=tree,
    X=df_tree[independent_variables],
    y=df_tree.PremiumStartDate,
    scoring="accuracy",
    cv=3,
).mean()

0.7769246981603589

In [52]:
# Mean ROC AUC over 3-fold cross-validation.
# NOTE(review): a single mean score does not by itself rule out bias or overfitting;
# compare it with the in-sample ROC AUC before concluding that.
cross_val_score(
    estimator=tree,
    X=df_tree[independent_variables],
    y=df_tree.PremiumStartDate,
    scoring="roc_auc",
    cv=3,
).mean()

0.7825400508452868

In [53]:
import graphviz
from sklearn.tree import export_graphviz

def draw_tree(tree):
    """Render a fitted decision tree as a PNG via Graphviz and open it.

    Parameters
    ----------
    tree : fitted decision tree estimator
        Passed straight to ``export_graphviz``. Node labels use the
        notebook-level ``independent_variables`` as feature names, so the
        tree must have been fitted on those columns in that order.

    Returns
    -------
    None
        Called for its side effects: ``graph.render('tree', view=True)``
        renders the graph under the base name ``tree`` in PNG format and
        asks Graphviz to open the result in a viewer.
    """
    dot_data = export_graphviz(
        tree,
        out_file=None,  # return the DOT source as a string
        feature_names=list(independent_variables),
        # class_names must be given in ascending order of the encoded labels:
        # 0 = not premium, 1 = premium. The previous order was reversed and
        # mislabelled every node in the drawing.
        class_names=['NotPremium', 'Premium'],
        filled=True,
        rounded=True,
        special_characters=True,
        proportion=True,  # try changing to proportion=False
    )

    graph = graphviz.Source(dot_data)
    graph.format = 'png'
    graph.render('tree', view=True)

In [54]:
#uncomment below to see decision tree drawing
#draw_tree(tree)

Fitted decision trees expose a `feature_importances_` attribute that estimates how much each feature contributes to the model's predictions; the importances are normalized so that they sum to 1.00.

In [55]:
# Map each feature name to its importance in the fitted tree.
{
    feature: importance
    for feature, importance in zip(independent_variables, tree.feature_importances_)
}

{'total_pre': 0.016527169546132245,
 'average_views': 0.2126578817356437,
 'average_slides': 0.770814948718224}

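As you can see, the average number of slides is by far the most important feature for predicting whether a user becomes premium (0.77 of the total importance). Note that feature importance measures how much a feature contributes to the tree's splits, not the direction of its effect, so the conclusion that users with a high number of slides in their presentations have a higher chance of becoming premium should be confirmed by inspecting the tree's splits (for example with `draw_tree`).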