In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import compress

import datetime
from dateutil.parser import parse

import math
import os
import copy
import pickle

import drugLookup

In [4]:
# Read in data (from pickle file)
# NOTE: pickle.load can execute arbitrary code embedded in the file, so these
# files must come from a trusted source.
# Context managers close each handle as soon as its object has been loaded.
with open('train_set_filtered','rb') as file:
    train_set = pickle.load(file)

with open('test_set_filtered','rb') as file:
    test_set = pickle.load(file)

In [5]:
# Plain-list copies of the drug-label and vendor-name columns of each split
d_train, v_train = (list(train_set[column]) for column in ('drug_prediction', 'vendor_name'))

d_test, v_test = (list(test_set[column]) for column in ('drug_prediction', 'vendor_name'))

In [6]:
# Do time binning: i.e. classify each transaction into given time interval
# Note: after this import the name `datetime` refers to the datetime class,
# no longer to the module imported at the top of the notebook.
from datetime import datetime, date, time, timedelta

# Time of day at which the first interval starts (midnight)
start = time(hour=0, minute=0, second=0, microsecond=0)
# Length of each interval, in minutes
interval = 5

def addTimes(start_time, duration):
    """Return the time of day that lies `duration` minutes after `start_time`.

    datetime.time objects do not support arithmetic, so the time is attached
    to an arbitrary date (today), shifted with a timedelta, and the date part
    is dropped again. Because the date is discarded, results past midnight
    wrap around (23:55 plus 10 minutes gives 00:05).

    start_time -- datetime.time to start from
    duration   -- number of minutes to add
    """
    anchored = datetime.combine(date.today(), start_time)
    shifted = anchored + timedelta(minutes=duration)
    return shifted.time()

#  Function to check if time is in a time interval
def timeInInterval(start_time, duration, check_time):
    """Return True if `check_time` lies in [start_time, start_time + duration).

    The interval is half-open: its start is included, its end is excluded.
    Intervals that run past midnight are handled, e.g. a 5 minute interval
    starting at 23:55 contains 23:57. A plain comparison against the wrapped
    end time (00:00) could never succeed for such an interval.

    start_time -- datetime.time at which the interval begins
    duration   -- interval length in minutes; a non-positive duration
                  describes an empty interval and always yields False
    check_time -- datetime.time to test
    """
    # Attach both times to the same arbitrary date so they can be subtracted.
    anchor = date(2000, 1, 1)
    elapsed = datetime.combine(anchor, check_time) - datetime.combine(anchor, start_time)
    # Time elapsed since the interval start, measured around the 24 hour
    # clock: a check_time "before" start_time counts as falling on the next day.
    elapsed %= timedelta(days=1)
    return elapsed < timedelta(minutes=duration)

# Function to classify given time into one of given bins
def binTime(time1, start_times, duration = interval):
    """Return the start time of the bin that contains `time1`.

    time1       -- object exposing .time() (its time of day is what gets binned)
    start_times -- sequence of candidate bin start times (datetime.time)
    duration    -- bin length in minutes

    Every candidate is checked with timeInInterval; the first match in
    `start_times` order wins. If no bin matches, the last entry of
    `start_times` is returned as a fallback.
    """
    matches = list(filter(lambda candidate: timeInInterval(candidate, duration, time1.time()), start_times))
    return(matches[0] if len(matches) != 0 else start_times[-1])
    
# Create list of time intervals
# Get list of interval start times
slide_amount = interval # amount to slide window (if no overlap desired, set equal to interval)

# Consecutive windows start slide_amount minutes apart, so the number of window
# starts that fit in a 24 hour period is governed by slide_amount, not by the
# window length (the two coincide while slide_amount == interval)
n_intervals = int(24*60/slide_amount)

# Get all intervals in 24 hour period
interval_starts = [addTimes(start_time = start, duration = slide_amount * n) for n in range(n_intervals)]

In [7]:
# Assign every training transaction to the time-of-day bin that contains its
# 'date' value (stored in a new 'time_bin' column)
train_set['time_bin'] = train_set['date'].apply(lambda stamp: binTime(stamp, start_times = interval_starts))

## Encoding data to numerical values

In [8]:
# Convert categorical variables into numerical classes
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column; the fitted encoders are kept so the same
# mapping can be applied to other data and inverted later
drug_le = LabelEncoder()
time_bin_le = LabelEncoder()
vendor_le = LabelEncoder()

# Fit each encoder on its training column and encode that column in one step
train_drugs_encoded = drug_le.fit_transform(train_set['drug_prediction'])
train_times_encoded = time_bin_le.fit_transform(train_set['time_bin'])
train_vendors_encoded = vendor_le.fit_transform(train_set['vendor_name'])

# Features: encoded drug and time bin; label: encoded vendor
X_train = pd.DataFrame({'drug': train_drugs_encoded, 'time_bin': train_times_encoded})
y_train = train_vendors_encoded

In [9]:
# Sanity check of the encoders: for each encoded column print the first n raw
# values, their integer codes, and the codes decoded back to the raw values
n = 10
encoding_checks = [
    (train_set['drug_prediction'], X_train['drug'], drug_le),
    (train_set['time_bin'], X_train['time_bin'], time_bin_le),
    (train_set['vendor_name'], y_train, vendor_le),
]
for position, (raw_values, codes, encoder) in enumerate(encoding_checks):
    if position > 0:
        # blank lines between the per-column reports
        print('\n\n')
    print(list(raw_values)[:n])
    print(codes[:n])
    print(encoder.inverse_transform(codes[:n]))

['ketamine', 'ketamine', 'methamphetamine', 'heroin', 'ketamine', 'ketamine', 'cocaine', 'ketamine', 'dmt', 'ketamine']
0     8
1     8
2    11
3     7
4     8
5     8
6     3
7     8
8     4
9     8
Name: drug, dtype: int64
['ketamine' 'ketamine' 'methamphetamine' 'heroin' 'ketamine' 'ketamine'
 'cocaine' 'ketamine' 'dmt' 'ketamine']



[datetime.time(19, 10), datetime.time(19, 10), datetime.time(22, 15), datetime.time(2, 0), datetime.time(19, 10), datetime.time(19, 10), datetime.time(20, 40), datetime.time(1, 50), datetime.time(1, 55), datetime.time(19, 10)]
0     77
1     77
2    109
3      4
4     77
5     77
6     95
7      2
8      3
9     77
Name: time_bin, dtype: int64
[datetime.time(19, 10) datetime.time(19, 10) datetime.time(22, 15)
 datetime.time(2, 0) datetime.time(19, 10) datetime.time(19, 10)
 datetime.time(20, 40) datetime.time(1, 50) datetime.time(1, 55)
 datetime.time(19, 10)]



['hiddenhands', 'hiddenhands', 'drugpharmacist', 'trotters_stash', 'hiddenhands', 'hiddenh

## Model training

In [31]:
# X_train
# y_train

array([142, 142, 133, ...,  44,  84,  99])

In [56]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive-Bayes classifier that predicts the encoded vendor
# (y_train) from the encoded drug and time-bin columns (X_train).
# NOTE(review): both feature columns are LabelEncoder integer codes, i.e.
# arbitrary category ids; MultinomialNB is presumably treating them as
# count-like magnitudes. Confirm this is intended, or consider one-hot
# features / a categorical Naive-Bayes variant.
nb_model = MultinomialNB()
# Left as the last expression so the fitted estimator is shown as cell output
nb_model.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## Evaluating the model

In [57]:
# Assign every test transaction to its time-of-day bin (same bins as training)
test_set['time_bin'] = test_set['date'].apply(lambda stamp: binTime(stamp, start_times = interval_starts))

# Labels: vendors encoded with the encoder fitted on the training vendors
y_test = vendor_le.transform(test_set['vendor_name'])

# Features: drugs and time bins encoded with the encoders fitted on training data
# NOTE(review): LabelEncoder.transform is expected to fail on values that were
# absent from the training data - confirm the filtered test set cannot contain any
test_drugs_encoded = drug_le.transform(test_set['drug_prediction'])
test_times_encoded = time_bin_le.transform(test_set['time_bin'])
X_test = pd.DataFrame({'drug': test_drugs_encoded, 'time_bin': test_times_encoded})

In [58]:
# X_test
# y_test

array([ 27, 164, 164, ..., 170, 170,  23])

In [47]:
# Function to get list of indices with max value
def multi_argmax(v, n):
    """Return the indices of the `n` largest entries of `v`, largest first.

    v -- vector of values (list or array); it is not modified
    n -- number of indices to return

    Returns a list of integer indices into `v`. Equal values are reported in
    ascending index order, which is the order repeated np.argmax calls would
    produce. If `n` exceeds the length of `v`, every index is returned exactly
    once; a non-positive `n` gives an empty list.

    The ranking comes from a single sort rather than from overwriting found
    maxima with a sentinel value, so it is also correct for vectors containing
    values of -1 or below.
    """
    values = np.asarray(v).ravel()
    count = max(0, min(n, values.size))
    # Stable ascending sort of the reversed vector, read back to front, yields
    # descending values with ties ordered by ascending original index
    # ('mergesort' is numpy's stable sort).
    reversed_order = np.argsort(values[::-1], kind='mergesort')[::-1]
    # Translate positions in the reversed vector back to original positions
    ranked = values.size - 1 - reversed_order
    return(list(ranked[:count]))

In [48]:
# Function to get the top class predictions using model.predict_proba
def get_top_n(prob_vector, n, encoder = vendor_le):
    """Return the labels of the `n` highest-scoring classes in `prob_vector`.

    prob_vector -- per-class scores for one sample (e.g. one row of
                   model.predict_proba output)
    n           -- number of classes to return
    encoder     -- fitted label encoder used to turn class indices back into
                   labels

    The labels come back in the index order produced by multi_argmax.
    """
    return(encoder.inverse_transform(multi_argmax(prob_vector, n)))

In [49]:
# Per-class probability predictions for every transaction in the test features
prob_ar = nb_model.predict_proba(X_test)

# For each row of probabilities keep the labels of the top_k best-scoring vendors
top_k = 15
vendor_preds = np.apply_along_axis(get_top_n, axis = 1, arr = prob_ar, n = top_k)

In [50]:
# Get accuracy
# Compute 'top n' accuracy
def get_top_n_acc(actuals, preds, n):
    """Return the top-n accuracy, rounded to 4 decimal places.

    actuals -- true label of each sample
    preds   -- for each sample, a sequence of predicted labels, best first
    n       -- a sample counts as a hit when its true label is among its
               first n predictions

    Samples are paired with zip, so surplus entries of the longer input are
    ignored.
    """
    hits = []
    for actual, ranked in zip(actuals, preds):
        if n == 1:
            # top-1: compare against the single best prediction
            hits.append(actual == ranked[0])
        else:
            hits.append(actual in ranked[:n])
    accuracy = np.sum(hits)/len(hits)
    return(round(accuracy,4))

In [51]:
# Report top-1 / top-5 / top-10 accuracy of the ranked vendor predictions
actual_vendor_names = vendor_le.inverse_transform(y_test)
for label, top_n in (('Top 1 accuracy :', 1), ('Top 5 accuracy :', 5), ('Top 10 accuracy:', 10)):
    print(label, get_top_n_acc(actual_vendor_names, vendor_preds, top_n))

Top 1 accuracy : 0.0224
Top 5 accuracy : 0.0821
Top 10 accuracy: 0.1754


In [52]:
# Single best vendor per test transaction, as encoded class ids
vendor_preds2 = nb_model.predict(X_test)

# Side-by-side table of actual and predicted vendor names
actual_names = vendor_le.inverse_transform(y_test)
predicted_names = vendor_le.inverse_transform(vendor_preds2)
df = pd.DataFrame({'actuals': actual_names, 'preds': predicted_names})

In [53]:
# Model score on the training data itself (not on the held-out test set)
train_score = nb_model.score(X_train, y_train)
print(train_score)

0.034415346393516066
