# Relax Data Science Challenge

## Preparation

In [1]:
# Import standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Create dataframe of users
df_users = pd.read_csv('takehome_users.csv', encoding = "ISO-8859-1", parse_dates=['creation_time'])

In [3]:
# Create data frame of engagement
df_engagement = pd.read_csv('takehome_user_engagement.csv', encoding = "ISO-8859-1", parse_dates=['time_stamp'])

In [4]:
# Show first five users
df_users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [5]:
# Show first five engagements
df_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


## New Columns

In [6]:
# Create date columns for users
df_users['day_of_week'] = df_users['creation_time'].dt.dayofweek
df_users['week_of_year'] = df_users['creation_time'].dt.week
df_users['year'] = df_users['creation_time'].dt.year
df_users['hour'] = df_users['creation_time'].dt.hour
df_users['month'] = df_users['creation_time'].dt.month
df_users['day_of_year'] = df_users['creation_time'].dt.dayofyear

In [7]:
# Create date columns for engagement
df_engagement['day_of_week'] = df_engagement['time_stamp'].dt.dayofweek
df_engagement['week_of_year'] = df_engagement['time_stamp'].dt.week
df_engagement['year'] = df_engagement['time_stamp'].dt.year
df_engagement['day_of_year'] = df_engagement['time_stamp'].dt.dayofyear

In [8]:
# Show df_engagement statistics
df_engagement.describe()

Unnamed: 0,user_id,visited,day_of_week,week_of_year,year,day_of_year
count,207917.0,207917.0,207917.0,207917.0,207917.0,207917.0
mean,5913.314197,1.0,3.000313,23.908993,2013.377468,164.977332
std,3394.941674,0.0,2.001468,15.302984,0.590782,107.945743
min,1.0,1.0,0.0,1.0,2012.0,1.0
25%,3087.0,1.0,1.0,11.0,2013.0,76.0
50%,5682.0,1.0,3.0,20.0,2013.0,137.0
75%,8944.0,1.0,5.0,38.0,2014.0,264.0
max,12000.0,1.0,6.0,52.0,2014.0,366.0


In [9]:
# Show df_engagement info
df_engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 7 columns):
time_stamp      207917 non-null datetime64[ns]
user_id         207917 non-null int64
visited         207917 non-null int64
day_of_week     207917 non-null int64
week_of_year    207917 non-null int64
year            207917 non-null int64
day_of_year     207917 non-null int64
dtypes: datetime64[ns](1), int64(6)
memory usage: 11.1 MB


In [10]:
# Create total days column listing day the user is active
df_engagement['days'] = (df_engagement['year'] - 2012) * 365 + df_engagement['day_of_year'] 

In [11]:
# Repeat total days column for df_users listing day when account was created
df_users['days'] = (df_users['year'] - 2012) * 365 + df_users['day_of_year'] 

In [12]:
# Show first ten rows of df_engagement
df_engagement.head(10)

Unnamed: 0,time_stamp,user_id,visited,day_of_week,week_of_year,year,day_of_year,days
0,2014-04-22 03:53:30,1,1,1,17,2014,112,842
1,2013-11-15 03:45:04,2,1,4,46,2013,319,684
2,2013-11-29 03:45:04,2,1,4,48,2013,333,698
3,2013-12-09 03:45:04,2,1,0,50,2013,343,708
4,2013-12-25 03:45:04,2,1,2,52,2013,359,724
5,2013-12-31 03:45:04,2,1,1,1,2013,365,730
6,2014-01-08 03:45:04,2,1,2,2,2014,8,738
7,2014-02-03 03:45:04,2,1,0,6,2014,34,764
8,2014-02-08 03:45:04,2,1,5,6,2014,39,769
9,2014-02-09 03:45:04,2,1,6,6,2014,40,770


## Determine "Adopted Users"

In [13]:
# Create column that shows login two days later.
df_engagement['two_days_later'] = df_engagement.groupby('user_id').days.shift(2)

In [14]:
# Create column that shows difference between first and third logins
df_engagement['days_diff'] = df_engagement['days'] - df_engagement['two_days_later']

In [15]:
# Create dataframe of adopted users
df_engaged_users = df_engagement[df_engagement['days_diff'] <=7 ]

In [16]:
# Show dataframe of adopted users
df_engaged_users.head()

Unnamed: 0,time_stamp,user_id,visited,day_of_week,week_of_year,year,day_of_year,days,two_days_later,days_diff
9,2014-02-09 03:45:04,2,1,6,6,2014,40,770,764.0,6.0
10,2014-02-13 03:45:04,2,1,3,7,2014,44,774,769.0,5.0
11,2014-02-16 03:45:04,2,1,6,7,2014,47,777,770.0,7.0
24,2013-02-06 22:08:03,10,1,2,6,2013,37,402,395.0,7.0
27,2013-02-19 22:08:03,10,1,1,8,2013,50,415,410.0,5.0


In [17]:
# Create dataframe grouped by adopted user ids
df_engagement_3 = df_engaged_users.groupby(['user_id']).count()

In [18]:
# Show number of adopted users
len(df_engagement_3)

1656

In [19]:
# Show first five rows of grouped by dataframe
df_engagement_3.head()

Unnamed: 0_level_0,time_stamp,visited,day_of_week,week_of_year,year,day_of_year,days,two_days_later,days_diff
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,3,3,3,3,3,3,3,3,3
10,267,267,267,267,267,267,267,267,267
20,1,1,1,1,1,1,1,1,1
33,4,4,4,4,4,4,4,4,4
42,329,329,329,329,329,329,329,329,329


In [20]:
# Get correct user ids
user_id = df_engagement_3.index.unique(level='user_id')

In [21]:
# Write function to determine adopted user
def adopted_user(row):
    if row['object_id'] in user_id:
        return 1
    else:
        return 0

In [22]:
# Apply adopted_user function to df_users dataframe
df_users['adopted_user'] = df_users.apply(adopted_user, axis=1)

In [23]:
# Show df_users with new adopted user column
df_users.head(10)

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,day_of_week,week_of_year,year,hour,month,day_of_year,days,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,1,17,2014,3,4,112,842,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,4,46,2013,3,11,319,684,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,1,12,2013,23,3,78,443,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,1,21,2013,8,5,141,506,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,3,3,2013,10,1,17,382,0
5,6,2013-12-17 03:37:06,Cunha Eduardo,EduardoPereiraCunha@yahoo.com,GUEST_INVITE,1387424000.0,0,0,197,11241.0,1,51,2013,3,12,351,716,0
6,7,2012-12-16 13:24:32,Sewell Tyler,TylerSewell@jourrapide.com,SIGNUP,1356010000.0,0,1,37,,6,50,2012,13,12,351,351,0
7,8,2013-07-31 05:34:02,Hamilton Danielle,DanielleHamilton@yahoo.com,PERSONAL_PROJECTS,,1,1,74,,2,31,2013,5,7,212,577,0
8,9,2013-11-05 04:04:24,Amsel Paul,PaulAmsel@hotmail.com,PERSONAL_PROJECTS,,0,0,302,,1,45,2013,4,11,309,674,0
9,10,2013-01-16 22:08:03,Santos Carla,CarlaFerreiraSantos@gustr.com,ORG_INVITE,1401833000.0,1,1,318,4143.0,2,3,2013,22,1,16,381,1


## Correlations

In [24]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 18 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null datetime64[ns]
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
day_of_week                   12000 non-null int64
week_of_year                  12000 non-null int64
year                          12000 non-null int64
hour                          12000 non-null int64
month                         12000 non-null int64
day_of_year                   12000 non-null int64
days                          12000 non-null int64
adopted_us

In [25]:
del df_users['last_session_creation_time']

In [26]:
df_users['invited_by_user_id'] = df_users['invited_by_user_id'].notnull().astype(int)

In [27]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 17 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null datetime64[ns]
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            12000 non-null int64
day_of_week                   12000 non-null int64
week_of_year                  12000 non-null int64
year                          12000 non-null int64
hour                          12000 non-null int64
month                         12000 non-null int64
day_of_year                   12000 non-null int64
days                          12000 non-null int64
adopted_user                  12000 non-null int64
dtypes: date

In [28]:
corr_matrix = df_users.corr()
corr_matrix['adopted_user'].sort_values(ascending=False)

adopted_user                  1.000000
org_id                        0.066995
month                         0.038433
day_of_year                   0.037167
week_of_year                  0.036831
invited_by_user_id            0.027831
opted_in_to_mailing_list      0.008838
day_of_week                   0.006590
enabled_for_marketing_drip    0.006578
object_id                     0.005292
hour                          0.000086
year                         -0.085418
days                         -0.086246
Name: adopted_user, dtype: float64

The last session creation time is not relevant because adopted users are more likely to be active members. The year/days negative correlation at 8.5% indicates that users who signed up in the early part of this time period are more likely to be adopted users. This could be due to users who signed up later having fewer opportunities to become adopted users.

It's also fairly clear that it made litte difference for users who opted into mailing list, were enabled for marketing drip, and were invited by a user_id.

The scatter plot above is not very convincing, but it does appear that the most recent users are more likely not to have adopted yet. This could be due to fewer opportunities.

In [29]:
# Show distribution of creation_source for all users
df_users['creation_source'].value_counts(normalize=True)

ORG_INVITE            0.354500
GUEST_INVITE          0.180250
PERSONAL_PROJECTS     0.175917
SIGNUP                0.173917
SIGNUP_GOOGLE_AUTH    0.115417
Name: creation_source, dtype: float64

In [30]:
# Create dataframe of adopted users only
df_adopted_users = df_users[df_users['adopted_user']==1]

In [31]:
# Show distribution of creation source for adopted users only
df_adopted_users['creation_source'].value_counts(normalize=True)

ORG_INVITE            0.346618
GUEST_INVITE          0.222826
SIGNUP                0.182367
SIGNUP_GOOGLE_AUTH    0.144324
PERSONAL_PROJECTS     0.103865
Name: creation_source, dtype: float64

Guest invites are more likey to become adopted users. Personal projects are less likely to become adopted users. Signup with Google Authorization is a little more likely to become an adopted user.

## Machine Learning

In [32]:
df_users.head(10)

Unnamed: 0,object_id,creation_time,name,email,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,day_of_week,week_of_year,year,hour,month,day_of_year,days,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1,0,11,1,1,17,2014,3,4,112,842,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,0,0,1,1,4,46,2013,3,11,319,684,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,0,0,94,1,1,12,2013,23,3,78,443,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,0,0,1,1,1,21,2013,8,5,141,506,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,0,0,193,1,3,3,2013,10,1,17,382,0
5,6,2013-12-17 03:37:06,Cunha Eduardo,EduardoPereiraCunha@yahoo.com,GUEST_INVITE,0,0,197,1,1,51,2013,3,12,351,716,0
6,7,2012-12-16 13:24:32,Sewell Tyler,TylerSewell@jourrapide.com,SIGNUP,0,1,37,0,6,50,2012,13,12,351,351,0
7,8,2013-07-31 05:34:02,Hamilton Danielle,DanielleHamilton@yahoo.com,PERSONAL_PROJECTS,1,1,74,0,2,31,2013,5,7,212,577,0
8,9,2013-11-05 04:04:24,Amsel Paul,PaulAmsel@hotmail.com,PERSONAL_PROJECTS,0,0,302,0,1,45,2013,4,11,309,674,0
9,10,2013-01-16 22:08:03,Santos Carla,CarlaFerreiraSantos@gustr.com,ORG_INVITE,1,1,318,1,2,3,2013,22,1,16,381,1


In [33]:
# Convert creation_source into numerical column
df_users['creation_source_factored'], creation_categories = df_users['creation_source'].factorize()

In [34]:
# Choose predictor variables
X = df_users[['creation_source_factored','opted_in_to_mailing_list', 'enabled_for_marketing_drip', 'invited_by_user_id', 'org_id']]

In [35]:
# Create target variable
y = df_users['adopted_user']

In [36]:
# Import machine learning modules
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

  from numpy.core.umath_tests import inner1d


In [37]:
# Create function to return the results of machine learning tests
def ml_classification_tests(X, y, num_cols=X.shape[1]):
    
    # Split into training and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    #------------------------------------------------------------------------
    
    # NAIVE BAYES
    print('Naive Bayes')
    
    #the grid of parameters to search over
    alphas = [0.001, 0.01, .1, 1, 5]
    
    param_grid = {'alpha': alphas}
    
    # Create a multinomial classifier
    mnb = MultinomialNB()
    
    mnb_cv = GridSearchCV(mnb, param_grid, cv=5)
    
    # Fit the classifier to the data
    mnb_cv.fit(X_train, y_train)
    
    # Print the tuned parameters and score
    print("Best Naive Bayes alpha: {}".format(mnb_cv.best_params_)) 
    print("Best Naive Bayes score: {}".format(mnb_cv.best_score_))
    
    # Predict the labels of the test set: y_pred
    y_pred = mnb_cv.predict(X_test)

    # Compute and print the confusion matrix and classification report
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
    print('Classification Report:', classification_report(y_test, y_pred))
    
    #------------------------------------------------------------------------

    
    # LOGISTIC REGRESSION
    print('\nLogistic Regression')
    
    # Setup the hyperparameter grid
    c_space = np.logspace(-5, 8, 10)
    param_grid = {'C': c_space}

    # Instantiate a logistic regression classifier: logreg
    logreg = LogisticRegression()

    # Instantiate the GridSearchCV object: logreg_cv
    logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

    # Fit it to the data
    logreg_cv.fit(X_train,y_train)

    # Print the tuned parameters and score
    print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_)) 
    print("Best Logistic Regression score: {}".format(logreg_cv.best_score_))
    
        # Predict the labels of the test set: y_pred
    y_pred = logreg_cv.predict(X_test)

    # Compute and print the confusion matrix and classification report
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
    print('Classification Report:', classification_report(y_test, y_pred))
    
    
    #------------------------------------------------------------------------
    
    # DECISION TREE TUNED
    print('\nDecision Tree')
    
    # Instantiate a Decision Tree classifier
    tree = DecisionTreeClassifier()
        
    if num_cols == 1:
        # Setup the parameters and distributions to sample from: param_dist
        param_dist = {"max_depth": [3, None],
              "criterion": ["gini", "entropy"]}

        # Instantiate RandomizedSearchCV
        tree_cv = GridSearchCV(tree, param_dist, cv=5)
    
    else:
        # Setup the parameters and distributions to sample from: param_dist
        param_dist = {"max_depth": [3, None],
                  "max_features": randint(1, num_cols),
                  "min_samples_leaf": randint(1, num_cols),
                  "criterion": ["gini", "entropy"]}

        # Instantiate RandomizedSearchCV
        tree_cv = RandomizedSearchCV(tree, param_dist, n_iter = 20, cv=5)

    # Fit it to the data
    tree_cv.fit(X,y)

    # Print the tuned parameters and score
    print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
    print("Best Tuned Decision Tree score: {}".format(tree_cv.best_score_))
    
    # Predict the labels of the test set: y_pred
    y_pred = tree_cv.predict(X_test)

    # Compute and print the confusion matrix and classification report
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
    print('Classification Report:', classification_report(y_test, y_pred))
    
    
    #------------------------------------------------------------------------
    
    # RANDOM FORESTS
    print('\nRandom Forests')
    
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 100, num = 10)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}    
    
    # Instantiate a Random Forest Classifier
    rfc = RandomForestClassifier()
    
    # Instantiate RandomizedSearchCV
    rf_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, cv = 5, n_jobs = -1)
    
    # Fit the random search model
    rf_random.fit(X_train, y_train)
    
    # Print the tuned parameters and score
    print("Tuned Random Forest Parameters: {}".format(rf_random.best_params_))
    print("Best Random Forest score: {}".format(rf_random.best_score_))
    
    # Predict the labels of the test set: y_pred
    y_pred = mnb_cv.predict(X_test)

    # Compute and print the confusion matrix and classification report
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
    print('Classification Report:', classification_report(y_test, y_pred))
    

In [38]:
# Check predictions
ml_classification_tests(X,y,5)

  'precision', 'predicted', average, warn_for)


Naive Bayes
Best Naive Bayes alpha: {'alpha': 0.001}
Best Naive Bayes score: 0.863
Confusion Matrix: [[2577    0]
 [ 423    0]]
Classification Report:              precision    recall  f1-score   support

          0       0.86      1.00      0.92      2577
          1       0.00      0.00      0.00       423

avg / total       0.74      0.86      0.79      3000


Logistic Regression
Tuned Logistic Regression Parameters: {'C': 1e-05}
Best Logistic Regression score: 0.863
Confusion Matrix: [[2577    0]
 [ 423    0]]
Classification Report:              precision    recall  f1-score   support

          0       0.86      1.00      0.92      2577
          1       0.00      0.00      0.00       423

avg / total       0.74      0.86      0.79      3000


Decision Tree


  'precision', 'predicted', average, warn_for)


Tuned Decision Tree Parameters: {'criterion': 'gini', 'max_depth': 3, 'max_features': 4, 'min_samples_leaf': 2}
Best Tuned Decision Tree score: 0.862
Confusion Matrix: [[2577    0]
 [ 423    0]]
Classification Report:              precision    recall  f1-score   support

          0       0.86      1.00      0.92      2577
          1       0.00      0.00      0.00       423

avg / total       0.74      0.86      0.79      3000


Random Forests


  'precision', 'predicted', average, warn_for)


Tuned Random Forest Parameters: {'n_estimators': 1000, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 30, 'bootstrap': True}
Best Random Forest score: 0.8425555555555555
Confusion Matrix: [[2577    0]
 [ 423    0]]
Classification Report:              precision    recall  f1-score   support

          0       0.86      1.00      0.92      2577
          1       0.00      0.00      0.00       423

avg / total       0.74      0.86      0.79      3000



  'precision', 'predicted', average, warn_for)


In [39]:
# Check percentage of adopted users
len(df_adopted_users)/len(df_users)

0.138

The machine learning algorithms are all predicting a result of 0. This is likely due to the class imbalance. This can be corrected.

## Resampling

In [40]:
import imblearn
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

[(0, 10344), (1, 10344)]


In [41]:
ml_classification_tests(X_resampled, y_resampled)

Naive Bayes
Best Naive Bayes alpha: {'alpha': 5}
Best Naive Bayes score: 0.5440190770817221
Confusion Matrix: [[1518 1076]
 [1237 1341]]
Classification Report:              precision    recall  f1-score   support

          0       0.55      0.59      0.57      2594
          1       0.55      0.52      0.54      2578

avg / total       0.55      0.55      0.55      5172


Logistic Regression
Tuned Logistic Regression Parameters: {'C': 0.0002782559402207126}
Best Logistic Regression score: 0.5489172467130704
Confusion Matrix: [[1396 1198]
 [1042 1536]]
Classification Report:              precision    recall  f1-score   support

          0       0.57      0.54      0.55      2594
          1       0.56      0.60      0.58      2578

avg / total       0.57      0.57      0.57      5172


Decision Tree
Tuned Decision Tree Parameters: {'criterion': 'gini', 'max_depth': None, 'max_features': 3, 'min_samples_leaf': 1}
Best Tuned Decision Tree score: 0.7773105181747874
Confusion Matrix: [[19

The Decision Tree is clearly outperforming the others. This is a viable ML model to utilize to predict future users.