# Pre-Processing

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Load and create dataframe to pre-process

In [2]:
df = pd.read_csv('drug_consumption_pp.csv', index_col=0)

In [3]:
data = df.drop(columns=['ID'])

### Create dummy variables to categorical features

In [4]:
idx_alcohol = data.columns.get_loc('Alcohol')
idx_vsa = data.columns.get_loc('VSA')

drugs = list(data.iloc[:,idx_alcohol:idx_vsa+1].columns)
print(drugs)

['Alcohol', 'Amphetamines', 'Amyl', 'Benzos', 'Caffeine', 'Cannabis', 'Chocolate', 'Cocaine', 'Crack', 'Ecstasy', 'Heroin', 'Ketamine', 'Legal Highs', 'LSD', 'Methadone', 'Mushrooms', 'Nicotine', 'VSA']


In [5]:
data['Gender'] = data['Gender'].replace(('Female','Male'), (0,1))
data[drugs] = data[drugs].replace(('Non-user','User'), (0,1))

pd.set_option('display.max_columns', None)
data.head()

Unnamed: 0,Age,Gender,Education,Country,Neuroticism,Extraversion,Openness_to_Experience,Agreeableness,Conscientiousness,Impulsiveness,Sensation-Seeking,Alcohol,Amphetamines,Amyl,Benzos,Caffeine,Cannabis,Chocolate,Cocaine,Crack,Ecstasy,Heroin,Ketamine,Legal Highs,LSD,Methadone,Mushrooms,Nicotine,VSA
0,35-44,0,Professional certificate/diploma,UK,39.0,36.0,42.0,37.0,42.0,-0.21712,-1.18084,1,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0
1,25-34,1,Doctorate,UK,29.0,52.0,55.0,48.0,41.0,-0.71126,-0.21575,1,1,1,0,1,1,1,1,0,1,0,1,0,1,1,0,1,0
2,35-44,1,Professional certificate/diploma,UK,31.0,45.0,40.0,32.0,34.0,-1.37983,0.40148,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0
3,18-24,0,Masters,UK,34.0,34.0,46.0,47.0,46.0,-1.37983,-1.18084,1,0,0,1,1,1,1,1,0,0,0,1,0,0,0,0,1,0
4,35-44,0,Doctorate,UK,43.0,28.0,43.0,41.0,50.0,-0.21712,-0.21575,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,1,0


In [6]:
dfo = data.select_dtypes(include='object')
data = pd.concat([data.drop(dfo, axis=1), pd.get_dummies(dfo, drop_first=True)], axis=1)
data.head()

Unnamed: 0,Gender,Neuroticism,Extraversion,Openness_to_Experience,Agreeableness,Conscientiousness,Impulsiveness,Sensation-Seeking,Alcohol,Amphetamines,Amyl,Benzos,Caffeine,Cannabis,Chocolate,Cocaine,Crack,Ecstasy,Heroin,Ketamine,Legal Highs,LSD,Methadone,Mushrooms,Nicotine,VSA,Age_25-34,Age_35-44,Age_45-54,Age_55+,Education_Doctorate,Education_Left school at 17 or younger,Education_Left school at 18,Education_Masters,Education_Professional certificate/diploma,Education_Some college/university,Country_Other,Country_UK,Country_USA
0,0,39.0,36.0,42.0,37.0,42.0,-0.21712,-1.18084,1,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0
1,1,29.0,52.0,55.0,48.0,41.0,-0.71126,-0.21575,1,1,1,0,1,1,1,1,0,1,0,1,0,1,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0
2,1,31.0,45.0,40.0,32.0,34.0,-1.37983,0.40148,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0
3,0,34.0,34.0,46.0,47.0,46.0,-1.37983,-1.18084,1,0,0,1,1,1,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,0,43.0,28.0,43.0,41.0,50.0,-0.21712,-0.21575,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0


### Standardize and split data into test/training sets

In [7]:
X = data.drop(columns=['Nicotine'])
y = data.Nicotine

SS = StandardScaler()
X_scaled = SS.fit_transform(X)
y = y.ravel()

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=1)

# Modeling

The aim of this project is to predict whether a someone is a nicotine user based on different personality traits and other drug usage. As this is a classification problem six different models will be explored: <br>
- K-nearest Neighbors (KNN)
- Logistic Regression
- Support Vector Machines (SVM)
- Random Forests
- Naive Bayes
- Gradient Boost <br>

The models will be evaluated by cross validation initially to determine the best two models. After distinguishing the models with the best performance, the best hyperparameters will be evaluated through grid searching to optimize their performance.

In [55]:
#Import models and metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

### KNN

In [15]:
#Determine good n_neighbors for knn through quick for loop
for n in range(5,101,5):
    knn = KNeighborsClassifier(weights='distance', n_neighbors=n)
    knn.fit(X_train, y_train)
    
    y_pred_knn = knn.predict(X_test)
    acc_knn = accuracy_score(y_test, y_pred_knn)
    
    print('Number of neighbors =', n, '  Accuracy_score =', acc_knn)

Number of neighbors = 5   Accuracy_score = 0.7245762711864406
Number of neighbors = 10   Accuracy_score = 0.7245762711864406
Number of neighbors = 15   Accuracy_score = 0.7542372881355932
Number of neighbors = 20   Accuracy_score = 0.75
Number of neighbors = 25   Accuracy_score = 0.7584745762711864
Number of neighbors = 30   Accuracy_score = 0.7627118644067796
Number of neighbors = 35   Accuracy_score = 0.760593220338983
Number of neighbors = 40   Accuracy_score = 0.7648305084745762
Number of neighbors = 45   Accuracy_score = 0.7542372881355932
Number of neighbors = 50   Accuracy_score = 0.7563559322033898
Number of neighbors = 55   Accuracy_score = 0.7584745762711864
Number of neighbors = 60   Accuracy_score = 0.760593220338983
Number of neighbors = 65   Accuracy_score = 0.7711864406779662
Number of neighbors = 70   Accuracy_score = 0.7690677966101694
Number of neighbors = 75   Accuracy_score = 0.760593220338983
Number of neighbors = 80   Accuracy_score = 0.7627118644067796
Number of 

In [17]:
#Selected 65 number of neighbors to proceed with cross validation
knn = KNeighborsClassifier(weights='distance', n_neighbors=65)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.7711864406779662

In [35]:
#Cross validation function to check mean score and std for each classifier
def cross_validate(classifier, X_test, y_test, X_train, y_train, scoring):
    cv_scores_train= cross_val_score(classifier,X_train,y_train,cv=5,scoring=scoring)
    print('Cross validation scores for training set:', cv_scores_train)
    cv_scores_test= cross_val_score(classifier,X_test,y_test,cv=5,scoring=scoring)
    print('Cross validation scores for test set:', cv_scores_test, '\n')

    cv_scores_knn_test= cv_scores_test.mean()
    cv_scores_knn_train= cv_scores_train.mean()
    
    print ('Mean cv train score: ' +str(cv_scores_knn_train))
    print ('Std cv train score: ' +str(cv_scores_train.std()), '\n')
    
    print ('Mean cv test score: ' +str(cv_scores_knn_test))
    print ('Std cv test score: ' +str(cv_scores_test.std()), '\n')

In [36]:
cross_validate(knn, X_test, y_test, X_train, y_train, 'roc_auc')

Cross validation scores for training set: [0.80254419 0.83005093 0.82897566 0.82727428 0.82579507]
Cross validation scores for test set: [0.75554435 0.82610887 0.78136201 0.86584741 0.8499744 ] 

Mean cv train score: 0.8229280272543888
Std cv train score: 0.010294858700291381 

Mean cv test score: 0.8157674091141833
Std cv test score: 0.041462317370836575 



In [56]:
def cfn_mtx(y_test, y_pred):
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

In [57]:
y_pred_knn = knn.predict(X_test)
cfn_mtx(y_test, y_pred_knn)

[[108  47]
 [ 61 256]]
              precision    recall  f1-score   support

           0       0.64      0.70      0.67       155
           1       0.84      0.81      0.83       317

    accuracy                           0.77       472
   macro avg       0.74      0.75      0.75       472
weighted avg       0.78      0.77      0.77       472



### Logistic Regression

In [37]:
#Determine good inverse regularization strength for log reg through quick for loop
C = [0.001, 0.01, 0.1, 1, 10]
for c in C:
    log_r = LogisticRegression(C=c, random_state=5)
    log_r.fit(X_train, y_train)
    
    y_pred_log_r = log_r.predict(X_test)
    acc_log_r = accuracy_score(y_test, y_pred_log_r)
    
    print('C =', c, '  Accuracy_score =', acc_log_r)

C = 0.001   Accuracy_score = 0.7796610169491526
C = 0.01   Accuracy_score = 0.7902542372881356
C = 0.1   Accuracy_score = 0.788135593220339
C = 1   Accuracy_score = 0.7838983050847458
C = 10   Accuracy_score = 0.7838983050847458


In [38]:
#Selected 0.01 inv regularization strength to proceed with cross validation
log_r = LogisticRegression(C=0.01, random_state=5)
log_r.fit(X_train, y_train)
log_r.score(X_test, y_test)

0.7902542372881356

In [39]:
cross_validate(log_r, X_test, y_test, X_train, y_train, 'roc_auc')

Cross validation scores for training set: [0.8153214  0.8466893  0.82637238 0.84337486 0.84087159]
Cross validation scores for test set: [0.74747984 0.82762097 0.80798771 0.86379928 0.84280594] 

Mean cv train score: 0.8345259090684081
Std cv train score: 0.011847472735327559 

Mean cv test score: 0.817938748079877
Std cv test score: 0.03969582424473494 



In [58]:
y_pred_lr = log_r.predict(X_test)
cfn_mtx(y_test, y_pred_lr)

[[100  55]
 [ 44 273]]
              precision    recall  f1-score   support

           0       0.69      0.65      0.67       155
           1       0.83      0.86      0.85       317

    accuracy                           0.79       472
   macro avg       0.76      0.75      0.76       472
weighted avg       0.79      0.79      0.79       472



### SVM

In [40]:
#Determine good inverse regularization strength for SVC through quick for loop
C = [0.001, 0.01, 0.1, 1, 10]
for c in C:
    svc = SVC(C=c, kernel='linear')
    svc.fit(X_train, y_train)
    
    y_pred_svc = svc.predict(X_test)
    acc_svc = accuracy_score(y_test, y_pred_svc)
    
    print('C =', c, '  Accuracy_score =', acc_svc)

C = 0.001   Accuracy_score = 0.7923728813559322
C = 0.01   Accuracy_score = 0.7966101694915254
C = 0.1   Accuracy_score = 0.7966101694915254
C = 1   Accuracy_score = 0.7966101694915254
C = 10   Accuracy_score = 0.7966101694915254


In [41]:
#Use default inv regularization strength of 1
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)
svc.score(X_test, y_test)

0.7966101694915254

In [42]:
cross_validate(svc, X_test, y_test, X_train, y_train, 'roc_auc')

Cross validation scores for training set: [0.76241135 0.76378042 0.80837578 0.81828526 0.77675371]
Cross validation scores for test set: [0.67137097 0.81350806 0.81259601 0.85304659 0.79467486] 

Mean cv train score: 0.785921303168952
Std cv train score: 0.02314567170202692 

Mean cv test score: 0.7890392985151049
Std cv test score: 0.061851707488203296 



In [59]:
y_pred_svc = svc.predict(X_test)
cfn_mtx(y_test, y_pred_svc)

[[107  48]
 [ 48 269]]
              precision    recall  f1-score   support

           0       0.69      0.69      0.69       155
           1       0.85      0.85      0.85       317

    accuracy                           0.80       472
   macro avg       0.77      0.77      0.77       472
weighted avg       0.80      0.80      0.80       472



### Random Forests

In [45]:
#Determine good inverse regularization strength for SVC through quick for loop
criterion = ['gini','entropy']
for c in criterion:
    rf = RandomForestClassifier(criterion=c, bootstrap=True)
    rf.fit(X_train, y_train)
    
    y_pred_rf = rf.predict(X_test)
    acc_rf = accuracy_score(y_test, y_pred_rf)
    
    print('Criterion =', c, '  Accuracy_score =', acc_svc)

Criterion = gini   Accuracy_score = 0.7966101694915254
Criterion = entropy   Accuracy_score = 0.7966101694915254


In [46]:
#Use default gini criterion, both critertion performed the same
rf = RandomForestClassifier(random_state=5)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.7838983050847458

In [47]:
cross_validate(rf, X_test, y_test, X_train, y_train, 'roc_auc')

Cross validation scores for training set: [0.8020376  0.83203169 0.83347482 0.84035956 0.83654776]
Cross validation scores for test set: [0.75100806 0.80166331 0.81822837 0.85125448 0.79595494] 

Mean cv train score: 0.8288902860369104
Std cv train score: 0.013726866289003525 

Mean cv test score: 0.803621831797235
Std cv test score: 0.032597685817477864 



In [60]:
y_pred_rf = rf.predict(X_test)
cfn_mtx(y_test, y_pred_rf)

[[ 94  61]
 [ 41 276]]
              precision    recall  f1-score   support

           0       0.70      0.61      0.65       155
           1       0.82      0.87      0.84       317

    accuracy                           0.78       472
   macro avg       0.76      0.74      0.75       472
weighted avg       0.78      0.78      0.78       472



### Naive Bayes

In [48]:
nb = GaussianNB()
nb.fit(X_train, y_train)
nb.score(X_test, y_test)

0.722457627118644

In [49]:
cross_validate(nb, X_test, y_test, X_train, y_train, 'roc_auc')

Cross validation scores for training set: [0.78197118 0.83814375 0.80268817 0.81919554 0.8255675 ]
Cross validation scores for test set: [0.72883065 0.78326613 0.81464414 0.85995904 0.82539683] 

Mean cv train score: 0.8135132284051597
Std cv train score: 0.019474800239742577 

Mean cv test score: 0.8024193548387097
Std cv test score: 0.04420490645886218 



In [61]:
y_pred_nb = nb.predict(X_test)
cfn_mtx(y_test, y_pred_nb)

[[121  34]
 [ 97 220]]
              precision    recall  f1-score   support

           0       0.56      0.78      0.65       155
           1       0.87      0.69      0.77       317

    accuracy                           0.72       472
   macro avg       0.71      0.74      0.71       472
weighted avg       0.76      0.72      0.73       472



### Gradient Boosting

In [50]:
gb = GradientBoostingClassifier(subsample=0.8, learning_rate=0.05 , n_estimators=160, random_state=5, max_depth=9, max_leaf_nodes=100)
gb.fit(X_train, y_train)
gb.score(X_test, y_test)

0.7733050847457628

In [51]:
cross_validate(gb, X_test, y_test, X_train, y_train, 'roc_auc')

Cross validation scores for training set: [0.81115614 0.82484437 0.83978495 0.82477101 0.8156682 ]
Cross validation scores for test set: [0.75504032 0.8109879  0.84178187 0.83256528 0.78545827] 

Mean cv train score: 0.8232449332997203
Std cv train score: 0.009818399781713859 

Mean cv test score: 0.805166730670763
Std cv test score: 0.031705885083458916 



In [62]:
y_pred_gb = gb.predict(X_test)
cfn_mtx(y_test, y_pred_gb)

[[ 90  65]
 [ 42 275]]
              precision    recall  f1-score   support

           0       0.68      0.58      0.63       155
           1       0.81      0.87      0.84       317

    accuracy                           0.77       472
   macro avg       0.75      0.72      0.73       472
weighted avg       0.77      0.77      0.77       472

