In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import compress

import datetime
from dateutil.parser import parse

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

import math
import os
import copy
import pickle

# import drugLookup

In [2]:
# Read in data (from pickle files).
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# these files should only be loaded if they come from a trusted source.
# Each file is opened in a `with` block so its handle is closed as soon as the
# object has been loaded (the handles were previously left open).
with open('drug_and_price_train.pkl', 'rb') as file:
    train_set = pickle.load(file)

with open('drug_and_price_test.pkl', 'rb') as file:
    test_set = pickle.load(file)

# Combined dataset (train and test)
with open('drug_and_price.pkl', 'rb') as file:
    data = pickle.load(file)

In [3]:
# Remove the 'vendor_pred' column from the test set.
# errors='ignore' keeps this cell re-runnable: test_set is reassigned here, so
# without it a second execution raises KeyError because the column is already gone.
test_set = test_set.drop(columns=['vendor_pred'], errors='ignore')

In [4]:
# Remove the 'vendor_pred' column from the combined dataset.
# errors='ignore' keeps this cell re-runnable: data is reassigned here, so
# without it a second execution raises KeyError because the column is already gone.
data = data.drop(columns=['vendor_pred'], errors='ignore')

In [5]:
# Preview the first five rows of the test set after dropping 'vendor_pred'.
test_set.head(n=5)

Unnamed: 0,transaction_id,vendor_id,vendor_name,bitcoin,USD,product_description,dates,drug_prediction,price_category
53463,'54561,453,DerSandmann,0,112.557,1g (91%+/-) COLUMBIA PLATIN Kokain/Cocaine Flakes,2018-12-01 22:15:22,cocaine,100.0
15112,'16210,907,pillonpourvousservir,0,70.819,10g KIA HASH prenium,2018-12-01 22:17:51,hashish,60.0
15154,'16252,907,pillonpourvousservir,0,70.819,10g KIA HASH prenium,2018-12-01 22:17:51,hashish,60.0
55014,'56112,48,PHARMALABS,0,39.52,USP Azithromycin 500 mg - 10 tablets,2018-12-01 22:27:33,others,20.0
15864,'16962,1197,Gladyman,0,7.28,Viagra 200MG Generic Cenforce,2018-12-01 22:29:13,others,0.0


In [121]:
# data

In [122]:
# Plain Python lists of the drug label and vendor name columns, one pair per split.
d_train, v_train = (list(train_set[col]) for col in ('drug_prediction', 'vendor_name'))
d_test, v_test = (list(test_set[col]) for col in ('drug_prediction', 'vendor_name'))

#### Encoding data to numerical values

In [123]:
# Time binning of the training set is currently disabled; the call is kept for reference:
# train_set['time_bin'] = train_set['new_date'].apply(binTime, start_times = interval_starts)

# In the cells visible here this value is only printed in the results header.
# NOTE(review): presumably the dollar width of the price_category bins - confirm.
interval = 20

# Targets: integer-encode the vendor names; the encoder is fitted on the training vendors.
vendor_le = preprocessing.LabelEncoder()
y_train = vendor_le.fit_transform(train_set['vendor_name'])

# Features: one-hot encode the drug label and price category columns.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# at transform time instead of raising.
enc = OneHotEncoder(handle_unknown='ignore', categories='auto')
X_train = enc.fit_transform(train_set[['drug_prediction', 'price_category']])

In [107]:
# Preview the first five rows of the combined (train + test) dataset.
data.head(n=5)

Unnamed: 0,transaction_id,vendor_id,vendor_name,bitcoin,USD,product_description,dates,drug_prediction,price_category
10946,'12044,754,hiddenhands,0,72.532,KETAMINE 3G AAA+++ | OFFER | Next Day Delivery,2018-04-03 19:14:02,ketamine,72.0
10911,'12009,754,hiddenhands,0,72.532,KETAMINE 3G AAA+++ | OFFER | Next Day Delivery,2018-04-03 19:14:02,ketamine,72.0
15416,'16514,438,drugpharmacist,0,145.6,Bomb Cartel Ice 7g - FREE PRIORITY SHIPPING,2018-04-03 22:19:03,methamphetamine,145.0
21600,'22698,962,trotters_stash,0,382.446,7g High Quality Pure #3 Heroin - Uncut,2018-04-04 02:00:13,heroin,382.0
10955,'12053,754,hiddenhands,0,72.532,KETAMINE 3G AAA+++ | OFFER | Next Day Delivery,2018-04-04 19:14:02,ketamine,72.0


#### Model Training

In [108]:
# train_set.head()

In [124]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naïve-Bayes classifier on the one-hot features and encoded vendors.
# fit() returns the estimator itself, so construction and fitting are chained.
nb_model = MultinomialNB().fit(X_train, y_train)
# Bare expression so the cell still displays the fitted estimator.
nb_model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

#### Evaluating the Model

In [110]:
# test_set.head()

In [125]:
# Time binning of the test set is currently disabled; the call is kept for reference:
# test_set[''] = test_set['new_date'].apply(binTime, start_times = interval_starts)

# Encode the test split with the encoders that were fitted on the training data,
# so the test columns and labels line up with what the model was trained on.
y_test = vendor_le.transform(test_set['vendor_name'])
X_test = enc.transform(test_set[['drug_prediction', 'price_category']])

In [126]:
# Function to get the indices of the n largest values
def multi_argmax(v, n):
    """Return the indices of the ``n`` largest entries of ``v``, largest first.

    Parameters
    ----------
    v : 1-D array-like of numbers
        Scores to rank. The input is not modified.
    n : int
        Number of indices to return. Values larger than ``len(v)`` are clamped
        to ``len(v)``; ``n <= 0`` yields an empty list.

    Returns
    -------
    list of int
        Distinct positions in ``v`` ordered by decreasing value. Ties are broken
        in favour of the lower index, matching repeated ``np.argmax`` calls.
    """
    scores = np.asarray(v, dtype=float)
    # One stable sort on the negated scores ranks every entry at once. Unlike
    # masking picked entries with -1, this stays correct for negative scores
    # (e.g. log-probabilities) and can never return the same index twice.
    order = np.argsort(-scores, kind='stable')
    # max(n, 0) guards against a negative n being read as "all but the last n".
    return order[:max(n, 0)].tolist()

In [127]:
# Map one row of class probabilities to the labels of its n highest-scoring classes
def get_top_n(prob_vector, n, encoder = vendor_le):
    """Return the labels of the ``n`` highest-scoring classes in ``prob_vector``.

    Parameters
    ----------
    prob_vector : 1-D array-like
        Scores for one sample, one entry per encoded class (e.g. a row of
        ``model.predict_proba`` output).
    n : int
        Number of labels to return.
    encoder : object with ``inverse_transform``
        Maps class indices back to labels. The default is the ``vendor_le``
        object that exists when this ``def`` is executed; if ``vendor_le`` is
        later rebound to a new encoder, re-run this cell or pass it explicitly.

    Returns
    -------
    Whatever ``encoder.inverse_transform`` returns for the selected indices,
    in the order produced by ``multi_argmax``.
    """
    return encoder.inverse_transform(multi_argmax(prob_vector, n))

In [128]:
# Class-probability predictions for every transaction in the test set
prob_ar = nb_model.predict_proba(X_test)

# For each row of probabilities keep the 15 top-ranked vendor names. This is
# more than the largest cut-off (top 10) reported in the results cell.
vendor_preds = np.apply_along_axis(lambda row: get_top_n(row, 15), 1, prob_ar)

In [129]:
# Top-n accuracy: share of samples whose true label is among the first n predictions
def get_top_n_acc(actuals, preds, n):
    """Compute top-``n`` accuracy, rounded to four decimal places.

    Parameters
    ----------
    actuals : iterable
        True label of each sample.
    preds : iterable of sequences
        Ranked predictions for each sample, best guess first, paired with
        ``actuals`` by position.
    n : int
        How many leading predictions count as a hit. ``n == 1`` compares the
        true label against the first prediction only.

    Returns
    -------
    The fraction of samples that were hits, rounded to 4 decimals.
    """
    hits = []
    for truth, ranked in zip(actuals, preds):
        if n == 1:
            hits.append(truth == ranked[0])
        else:
            hits.append(truth in ranked[:n])
    return round(np.sum(hits) / len(hits), 4)

In [130]:
# Report top-1 / top-5 / top-10 accuracy of the sklearn Naïve-Bayes model
print('sklearn Naïve-Bayes,', interval, 'dollar:')

# Decode the true vendor labels once and reuse them for every cut-off.
actual_vendors = vendor_le.inverse_transform(y_test)
for label, top_n in (('Top 1 accuracy :', 1),
                     ('Top 5 accuracy :', 5),
                     ('Top 10 accuracy:', 10)):
    print(label, get_top_n_acc(actual_vendors, vendor_preds, top_n))

sklearn Naïve-Bayes, 20 dollar:
Top 1 accuracy : 0.222
Top 5 accuracy : 0.6021
Top 10 accuracy: 0.8071
